This R script is used to validate the points in the ReSurvey database using RS indicators (NDVI, NDMI, canopy height).
library(tidyverse)
library(here)
library(gridExtra)
Adjuntando el paquete: ‘gridExtra’
The following object is masked from ‘package:dplyr’:
    combine
library(readxl)
library(scales)
Adjuntando el paquete: ‘scales’
The following object is masked from ‘package:purrr’:
    discard
The following object is masked from ‘package:readr’:
    col_factor
library(sf)
Linking to GEOS 3.13.1, GDAL 3.10.2, PROJ 9.5.1; sf_use_s2() is TRUE
library(rnaturalearth)
library(dtplyr)
G3;Registered S3 method overwritten by 'data.table':
method from
print.data.table
g
library(lme4)
G3;Cargando paquete requerido: Matrix
gG3;
Adjuntando el paquete: ‘Matrix’
gG3;The following objects are masked from ‘package:tidyr’:
expand, pack, unpack
g
library(lmerTest)
G3;
Adjuntando el paquete: ‘lmerTest’
gG3;The following object is masked from ‘package:lme4’:
lmer
gG3;The following object is masked from ‘package:stats’:
step
g
library(car)
G3;Cargando paquete requerido: carData
gG3;
Adjuntando el paquete: ‘car’
gG3;The following object is masked from ‘package:dplyr’:
recode
gG3;The following object is masked from ‘package:purrr’:
some
g
library(ggeffects)
library(party)
G3;Cargando paquete requerido: grid
gG3;Cargando paquete requerido: mvtnorm
gG3;Cargando paquete requerido: modeltools
gG3;Cargando paquete requerido: stats4
gG3;
Adjuntando el paquete: ‘modeltools’
gG3;The following object is masked from ‘package:car’:
Predict
gG3;The following object is masked from ‘package:lme4’:
refit
gG3;Cargando paquete requerido: strucchange
gG3;Cargando paquete requerido: zoo
gG3;
Adjuntando el paquete: ‘zoo’
gG3;The following objects are masked from ‘package:base’:
as.Date, as.Date.numeric
gG3;Cargando paquete requerido: sandwich
gG3;
Adjuntando el paquete: ‘strucchange’
gG3;The following object is masked from ‘package:stringr’:
boundary
gG3;
Adjuntando el paquete: ‘party’
gG3;The following object is masked from ‘package:dplyr’:
where
g
library(partykit)
G3;Cargando paquete requerido: libcoin
gG3;
Adjuntando el paquete: ‘partykit’
gG3;The following objects are masked from ‘package:party’:
cforest, ctree, ctree_control, edge_simple, mob, mob_control, node_barplot, node_bivplot, node_boxplot,
node_inner, node_surv, node_terminal, varimp
g
library(strucchange)
library(ggparty)
G3;
Adjuntando el paquete: ‘ggparty’
gG3;The following object is masked from ‘package:ggeffects’:
get_predictions
g
library(caret)
G3;Cargando paquete requerido: lattice
gG3;
Adjuntando el paquete: ‘caret’
gG3;The following object is masked from ‘package:purrr’:
lift
g
library(moreparty)
G3;Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
g
library(randomForest)
G3;randomForest 4.7-1.2
gG3;Type rfNews() to see new features/changes/bug fixes.
gG3;
Adjuntando el paquete: ‘randomForest’
gG3;The following object is masked from ‘package:gridExtra’:
combine
gG3;The following object is masked from ‘package:dplyr’:
combine
gG3;The following object is masked from ‘package:ggplot2’:
margin
g
library(pROC)
G3;Type 'citation("pROC")' for a citation.
gG3;
Adjuntando el paquete: ‘pROC’
gG3;The following objects are masked from ‘package:stats’:
cov, smooth, var
g
# Print a tibble (or data frame) in full, without truncating columns.
# `width = Inf` tells the tibble print method to show every column.
# Returns its input invisibly, like print() itself.
printall <- function(tibble) {
  out <- print(x = tibble, width = Inf)
  invisible(out)
}
source("https://gist.githubusercontent.com/benmarwick/2a1bb0133ff568cbe28d/raw/fb53bd97121f7f9ce947837ef1a4c65a73bffb3f/geom_flat_violin.R")
# Read the cleaned ReSurvey + remote-sensing table.
# All EUNIS habitat-code columns must be read as character (codes like
# "T1" would otherwise be mis-guessed); the names are generated instead of
# listed one by one: EUNIS{a-d}, EUNIS{a-d}_{1-4}, EUNIS{a-d}_{1-4}_descr,
# plus EUNIS_assignation.
eunis_letters <- c("a", "b", "c", "d")
eunis_char_cols <- c(
  paste0("EUNIS", eunis_letters),
  paste0("EUNIS", rep(eunis_letters, each = 4), "_", 1:4),
  paste0("EUNIS", rep(eunis_letters, each = 4), "_", 1:4, "_descr"),
  "EUNIS_assignation"
)
# Build the readr column specification: guess everything except the
# EUNIS columns, which are forced to character.
eunis_col_spec <- do.call(
  cols,
  c(list(.default = col_guess()),
    setNames(rep(list(col_character()), length(eunis_char_cols)),
             eunis_char_cols))
)
db_resurv_RS <- read_tsv(
  here("data", "clean", "db_resurv_RS_20250523.csv"),
  col_types = eunis_col_spec
)
No parsing issues!
Number of rows where more than one EUNIS level-1 code is assigned and the assigned codes differ from each other. Decide later how to handle these; for now EUNISa_1 is used.
# Count rows where several EUNIS level-1 codes were assigned and they
# disagree among themselves
db_resurv_RS %>%
  # Rows with more than one EUNIS 1 assigned, where the codes differ
  filter(!is.na(EUNISb_1),
         EUNISa_1 != EUNISb_1 | EUNISb_1 != EUNISc_1 |
           EUNISa_1 != EUNISc_1) %>%
  nrow()
[1] 102
See “confusions”:
# List the distinct combinations of conflicting level-1 codes
# ("confusions")
db_resurv_RS %>%
  # Rows with more than one EUNIS 1 assigned, where the codes differ
  filter(!is.na(EUNISb_1),
         EUNISa_1 != EUNISb_1 | EUNISb_1 != EUNISc_1 |
           EUNISa_1 != EUNISc_1) %>%
  distinct(EUNISa_1, EUNISb_1, EUNISc_1, EUNISd_1)
Define “confusion” columns:
# Flag level-1 EUNIS "confusions": rows where several level-1 codes were
# assigned and they disagree in one of the known ways. EUNIS1_conf_type
# names the confusion pair; EUNIS1_conf is TRUE whenever one was assigned.
db_resurv_RS <- db_resurv_RS %>%
  mutate(
    EUNIS1_conf_type = case_when(
      EUNISa_1 == "R" & EUNISb_1 == "S" ~ "R/S",
      EUNISa_1 == "S" & EUNISb_1 == "T" ~ "S/T",
      # R/S also when the first two codes are R and the third is S.
      # (The original had an extra, more specific rule that additionally
      # required EUNISd_1 == "S"; it was unreachable because this clause
      # already matches every row that rule would have matched.)
      EUNISa_1 == "R" & EUNISb_1 == "R" & EUNISc_1 == "S" ~ "R/S",
      EUNISa_1 == "P" & EUNISb_1 == "Q" ~ "P/Q",
      TRUE ~ NA_character_
    ),
    EUNIS1_conf = !is.na(EUNIS1_conf_type)
  )
# Keep a short working copy with only the identification, location,
# habitat, and remote-sensing columns used in the analyses below.
db_resurv_RS_short <- db_resurv_RS %>%
select(PlotObservationID, Country, RS_CODE, `ReSurvey site`, `ReSurvey plot`,
`Manipulate (y/n)`, `Type of manipulation`, Lon_updated, Lat_updated,
`Location method`, `Location uncertainty (m)`, EUNISa_1,
EUNISa_1_descr, EUNISa_2, EUNISa_2_descr, EUNISa_3, EUNISa_3_descr,
EUNISa_4, EUNISa_4_descr, EUNIS1_conf, EUNIS1_conf_type,
date, year, biogeo, unit, year_RS, Lon_RS, Lat_RS,
# Spectral indices (all summary statistics of each index)
starts_with("NDVI"), starts_with("NDMI"), starts_with("NDWI"),
starts_with("EVI"), starts_with("SAVI"), canopy_height,
# Phenology: start of season, peak, end of season
SOS_DOY, SOS_date, NDVI_at_SOS, Peak_DOY, Peak_date, NDVI_at_Peak,
EOS_DOY, EOS_date, NDVI_at_EOS, Season_Length,
# Data-availability flags per RS source
S2_data, Landsat_data, CH_data, S2_phen_data)
Do when all RS data is ready!
# Flag observations whose survey year differs from the year used for the
# RS extraction (those RS indices would need to be recalculated)
db_resurv_RS_short <- db_resurv_RS_short %>%
mutate(year_diff = year != year_RS)
db_resurv_RS_short %>% count(year_diff)
None with different year so far.
# Classify the mismatch between an updated coordinate and the coordinate
# used for the RS extraction: "NO" (identical), "SMALL" (< 0.01 degrees),
# NA when either value is missing, "LARGE" otherwise.
# (Factored into a helper so longitude and latitude share one definition.)
classify_coord_diff <- function(updated, rs) {
  case_when(
    updated == rs ~ "NO",
    # Sometimes they are only slightly different
    abs(updated - rs) < 0.01 ~ "SMALL",
    is.na(updated) | is.na(rs) ~ NA,
    TRUE ~ "LARGE"
  )
}
db_resurv_RS_short <- db_resurv_RS_short %>%
  mutate(Lon_diff = classify_coord_diff(Lon_updated, Lon_RS),
         Lat_diff = classify_coord_diff(Lat_updated, Lat_RS))
db_resurv_RS_short %>% count(Lon_diff)
db_resurv_RS_short %>% count(Lat_diff)
Very few with large differences (3 for longitude, 5 for latitude). RS indices would need to be calculated again for those.
If year_diff is TRUE, or Lon_diff or Lat_diff is LARGE, the RS indices need to be recalculated. For now, remove those rows from db_resurv_RS_short.
# Drop observations whose RS indices were computed for the wrong year or
# for a clearly different location (these need recalculation); keep NAs
db_resurv_RS_short <- db_resurv_RS_short %>%
  filter(is.na(year_diff) | !year_diff) %>%
  filter(is.na(Lon_diff) | Lon_diff %in% c("NO", "SMALL")) %>%
  filter(is.na(Lat_diff) | Lat_diff %in% c("NO", "SMALL"))
Add column PLOT to data to identify unique plots:
# Assign a unique integer id (PLOT) to every RS_CODE x site x plot
# combination, using data.table (via dtplyr) for speed.
db_resurv_RS_short_PLOT <- db_resurv_RS_short %>%
# Original names give problems, create new vars
mutate(RS_site = `ReSurvey site`, RS_plot = `ReSurvey plot`) %>%
# Convert to data.table for faster processing
lazy_dt() %>%
# Group by the 3 vars that uniquely identify each plot
group_by(RS_CODE, RS_site, RS_plot) %>%
# Create a new variable PLOT for each group; .GRP is data.table's group
# counter, available here only because of lazy_dt()
mutate(PLOT = .GRP) %>%
# Convert back to tibble
as_tibble() %>%
# Remove unneeded vars
select(-RS_site, -RS_plot)
There should be only one observation of each plot per year.
Plots where there is at least a year with more than one observation, and where those observations have a different EUNIS assigned:
# Plots with at least one year with more than one observation where those
# observations carry different (non-missing) EUNIS level-1 codes; these
# ambiguous plots are removed from the database below.
plots_to_remove <- db_resurv_RS_short_PLOT %>%
  group_by(PLOT, year) %>%
  # .groups = "drop" returns an ungrouped result directly (and silences
  # the "has grouped output" message), so no separate ungroup() is needed
  summarize(EUNISa_1_n = n_distinct(EUNISa_1, na.rm = TRUE),
            .groups = "drop") %>%
  filter(EUNISa_1_n > 1) %>%
  distinct(PLOT)
`summarise()` has grouped output by 'PLOT'. You can override using the `.groups` argument.
Remove plots_to_remove from the database:
# Drop the plots whose repeated within-year observations disagree on habitat
db_resurv_RS_short_PLOT <- db_resurv_RS_short_PLOT %>%
anti_join(plots_to_remove, by = "PLOT")
Plots and years where there is more than one observation:
# Plots that have, in some year, more than one observation; these will be
# merged into one row per plot-year below
plots_to_merge <- db_resurv_RS_short_PLOT %>%
  # add_count() attaches the per-(PLOT, year) observation count as `n`
  add_count(PLOT, year) %>%
  filter(n > 1) %>%
  distinct(PLOT)
Summarize plots_to_merge:
# Collapse plots with several observations in the same year into a single
# row per plot-year: metadata columns take their first value, while the
# observation dates/IDs are first spread into obs_1, obs_2, ... columns
# and then reduced with min().
plots_to_merge_summ <- db_resurv_RS_short_PLOT %>%
group_by(PLOT, year) %>%
# Plots that have more than one observation per year
filter(n() > 1) %>%
# Number the observations within each plot-year ...
mutate(obs_num = row_number()) %>%
# ... and spread date / PlotObservationID into one column per observation
pivot_wider(
names_from = obs_num,
values_from = c(date, PlotObservationID),
names_prefix = "obs_"
) %>%
arrange(PLOT) %>%
summarize(
# Metadata columns: keep the first value (assumed constant within a
# plot-year — TODO confirm)
across(c(Country, RS_CODE, `ReSurvey site`, `ReSurvey plot`,
`Manipulate (y/n)`, `Type of manipulation`, Lon_updated,
Lat_updated, `Location method`, `Location uncertainty (m)`,
EUNISa_1, EUNISa_1_descr, EUNISa_2, EUNISa_2_descr, EUNISa_3,
EUNISa_3_descr, EUNISa_4, EUNISa_4_descr, EUNIS1_conf,
EUNIS1_conf_type, biogeo, unit, year_RS, Lon_RS, Lat_RS, NDVI_max,
NDVI_median, NDVI_min, NDVI_mode, NDVI_p10, NDVI_p90, NDMI_max,
NDMI_median, NDMI_min, NDMI_mode, NDMI_p10, NDMI_p90, NDWI_max,
NDWI_median, NDWI_min, NDWI_mode, NDWI_p10, NDWI_p90, EVI_max,
EVI_median, EVI_min, EVI_mode, EVI_p10, EVI_p90, SAVI_max,
SAVI_median, SAVI_min, SAVI_mode, SAVI_p10, SAVI_p90,
canopy_height, SOS_DOY, SOS_date, Peak_DOY, Peak_date, EOS_DOY,
EOS_date, S2_data, Landsat_data, CH_data, S2_phen_data, year_diff,
Lon_diff, Lat_diff), first),
# Earliest date and smallest observation ID per plot-year
across(starts_with("date_obs_"), min),
across(starts_with("PlotObservationID_obs_"), min)
) %>%
ungroup()
`summarise()` has grouped output by 'PLOT'. You can override using the `.groups` argument.
Remove plots_to_merge from the database:
# Remove the plot-years that are being merged (re-added, merged, below)
db_resurv_RS_short_PLOT <- db_resurv_RS_short_PLOT %>%
anti_join(plots_to_merge, by = "PLOT")
And add plots_to_merge_summ, where each plot and year only has one row:
# Re-attach the merged rows: now each plot-year appears exactly once
db_resurv_RS_short_PLOT <- bind_rows(db_resurv_RS_short_PLOT,
plots_to_merge_summ)
Check that there is only one row per plot and per year:
# Sanity check: this should return zero rows (one observation per
# plot-year after the merge)
db_resurv_RS_short_PLOT %>%
group_by(PLOT, year) %>%
# Plots that have more than one observation per year
filter(n() > 1)
So, to sum up what I have done:
Save clean file for analyses (to be updated continuously due to updates in ReSurvey database and updates on RS data).
# Save the cleaned, one-row-per-plot-year table (tab-separated despite
# the .csv extension, matching the input file)
write_tsv(db_resurv_RS_short_PLOT,
here("data", "clean","db_resurv_RS_short_PLOT_20250523.csv"))
# Define a function to create histograms
# Histogram of one column for the habitats of interest (forests T,
# grasslands R, shrublands S, wetlands Q).
#
# data:    tibble with an EUNISa_1 column and the column named by x_var
# x_var:   column name (string) to plot on the x axis
# x_label: human-readable axis label
plot_histogram <- function(data, x_var, x_label) {
  habitats_of_interest <- c("T", "R", "S", "Q")
  data %>%
    filter(EUNISa_1 %in% habitats_of_interest) %>%
    ggplot(aes(x = .data[[x_var]])) +
    geom_histogram(color = "black", fill = "white") +
    labs(x = x_label, y = "Frequency") +
    theme_bw()
}
# Define a function to create plots with violin + boxplot + points
# Raincloud-style distribution plot (flat violin + jittered points +
# boxplot) of each y variable by EUNIS level-1 habitat, printing one
# figure per variable.
#
# data:     tibble with EUNISa_1 / EUNISa_1_descr and the y variables
# y_vars:   character vector of column names to plot
# y_labels: axis labels, parallel to y_vars
distr_plot <- function(data, y_vars, y_labels) {
  for (i in seq_along(y_vars)) {
    y_var <- y_vars[[i]]
    y_label <- y_labels[[i]]
    p <- ggplot(data = data %>%
                  filter(EUNISa_1 %in% c("T", "R", "S", "Q")),
                aes(x = EUNISa_1_descr, y = !!sym(y_var),
                    fill = EUNISa_1_descr)) +
      geom_flat_violin(position = position_nudge(x = 0.2, y = 0),
                       alpha = 0.8) +
      geom_point(aes(y = !!sym(y_var), color = EUNISa_1_descr),
                 position = position_jitter(width = 0.15),
                 size = 1, alpha = 0.25) +
      geom_boxplot(width = 0.2, outlier.shape = NA, alpha = 0.5) +
      # fun (not the deprecated fun.y) marks the group mean
      stat_summary(fun = mean, geom = "point", shape = 20, size = 1) +
      # Annotate each group with its sample size just above its maximum;
      # after_stat() replaces the deprecated ..label.. pronoun
      stat_summary(fun.data = function(x) data.frame(y = max(x) + 0.1,
                                                     label = length(x)),
                   geom = "text", aes(label = after_stat(label)),
                   vjust = 0.5) +
      labs(y = y_label, x = "EUNIS level 1") +
      scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
      # "none" replaces the deprecated guides(fill = FALSE)
      guides(fill = "none", color = "none") +
      theme_bw() + coord_flip()
    print(p)
  }
}
Ranges of min and max:
range(db_resurv_RS_short_PLOT$NDVI_max, na.rm = T)
[1] -0.0652840 0.9977221
range(db_resurv_RS_short_PLOT$NDMI_max, na.rm = T)
[1] -0.1656801 0.9996278
range(db_resurv_RS_short_PLOT$NDWI_max, na.rm = T)
[1] -0.9977221 0.9978094
range(db_resurv_RS_short_PLOT$SAVI_max, na.rm = T)
[1] -0.04495294 0.86572465
range(db_resurv_RS_short_PLOT$EVI_max, na.rm = T)
[1] -3.444331e-02 2.251800e+13
range(db_resurv_RS_short_PLOT$NDVI_min, na.rm = T)
[1] -0.9963834 0.9977221
range(db_resurv_RS_short_PLOT$NDMI_min, na.rm = T)
[1] -0.9988908 0.9995986
range(db_resurv_RS_short_PLOT$NDWI_min, na.rm = T)
[1] -0.99772209 -0.03415906
range(db_resurv_RS_short_PLOT$SAVI_min, na.rm = T)
[1] -0.5697995 0.7867734
range(db_resurv_RS_short_PLOT$EVI_min, na.rm = T)
[1] -43.78472 1.65059
Histograms to check that max and min values are ok:
# Histograms of the max/min spectral indices (visual sanity check of the
# ranges reported above)
plot_histogram(db_resurv_RS_short_PLOT, "NDVI_max", "NDVI max")
plot_histogram(db_resurv_RS_short_PLOT, "NDMI_max", "NDMI max")
plot_histogram(db_resurv_RS_short_PLOT, "NDWI_max", "NDWI max")
plot_histogram(db_resurv_RS_short_PLOT, "SAVI_max", "SAVI max")
plot_histogram(db_resurv_RS_short_PLOT %>%
# Some EVI_max values exceed the valid bound of 1 — exclude them here
filter(EVI_max <= 1), "EVI_max", "EVI max")
plot_histogram(db_resurv_RS_short_PLOT, "NDVI_min", "NDVI min")
plot_histogram(db_resurv_RS_short_PLOT, "NDMI_min", "NDMI min")
plot_histogram(db_resurv_RS_short_PLOT, "NDWI_min", "NDWI min")
plot_histogram(db_resurv_RS_short_PLOT, "SAVI_min", "SAVI min")
plot_histogram(db_resurv_RS_short_PLOT %>%
# Some EVI_min values fall outside [-1, 1] — exclude them here
filter(EVI_min >= -1 & EVI_min <= 1), "EVI_min", "EVI min")
# How many habitat-of-interest observations have an invalid EVI_max (> 1)?
nrow(db_resurv_RS_short_PLOT %>%
filter(EUNISa_1 %in% c("T", "R", "S", "Q")) %>%
filter(EVI_max > 1))
[1] 1594
# Where do the invalid EVI_max values occur (by biogeo region and unit)?
db_resurv_RS_short_PLOT %>%
filter(EUNISa_1 %in% c("T", "R", "S", "Q"))%>%
filter(EVI_max > 1) %>%
count(biogeo, unit)
So far, remove observations with EVI_max > 1 and EVI_min <-1 or > 1 from stuff using EVI values.
Distribution plots:
distr_plot(db_resurv_RS_short_PLOT,
c("NDVI_max", "NDVI_p90", "NDVI_min", "NDVI_p10"),
c("NDVI max", "NDVI p90", "NDVI min", "NDVI p10"))
distr_plot(db_resurv_RS_short_PLOT,
c("NDMI_max", "NDMI_p90", "NDMI_min", "NDMI_p10"),
c("NDMI max", "NDMI p90", "NDMI min", "NDMI p10"))
distr_plot(db_resurv_RS_short_PLOT,
c("NDWI_max", "NDWI_p90", "NDWI_min", "NDWI_p10"),
c("NDWI max", "NDWI p90", "NDWI min", "NDWI p10"))
distr_plot(db_resurv_RS_short_PLOT,
c("SAVI_max", "SAVI_p90", "SAVI_min", "SAVI_p10"),
c("SAVI max", "SAVI p90", "SAVI min", "SAVI p10"))
distr_plot(db_resurv_RS_short_PLOT %>%
filter(EVI_max <= 1) %>%
filter(EVI_min >= -1 & EVI_min <= 1),
c("EVI_max", "EVI_p90", "EVI_min", "EVI_p10"),
c("EVI max", "EVI p90", "EVI min", "EVI p10"))
Some suspicious values -> check these.
distr_plot(db_resurv_RS_short_PLOT, "canopy_height", "Canopy height (m)")
ggplot(db_resurv_RS_short_PLOT %>%
# Keep only forests, grasslands, shrublands and wetlands
filter(EUNISa_1 %in% c("T", "R", "S", "Q")) %>%
mutate(CH_cat =
factor(
case_when(canopy_height == 0 ~ "0 m",
canopy_height > 0 & canopy_height <= 1 ~ "0-1 m",
canopy_height > 1 & canopy_height <=2 ~ "1-2 m",
canopy_height > 2 & canopy_height <=5 ~ "2-5 m",
canopy_height > 5 & canopy_height <=8 ~ "5-8 m",
canopy_height > 8 ~ "> 8 m",
is.na(canopy_height) ~ NA_character_),
levels = c(
"0 m", "0-1 m", "1-2 m", "2-5 m", "5-8 m", "> 8 m"))),
aes(x = EUNISa_1_descr, fill = CH_cat)) +
geom_bar() + theme_bw() + coord_flip() +
scale_y_continuous(labels = label_number()) +
scale_fill_viridis_d(direction = -1) +
labs(x = "EUNIS level 1", fill = "Canopy height") +
scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
theme(legend.position = c(0.8, 0.75),
legend.direction = "vertical")
# Canopy-height summary statistics per EUNIS level-1 habitat.
# na.rm = TRUE is supplied inside each lambda: passing it through
# across(...)'s `...` is deprecated in dplyr >= 1.1 (same result).
db_resurv_RS_short_PLOT %>%
  # Keep only forests, grasslands, shrublands and wetlands
  filter(EUNISa_1 %in% c("T", "R", "S", "Q")) %>%
  group_by(EUNISa_1_descr) %>%
  summarise(across(canopy_height, list(
    mean = ~ mean(.x, na.rm = TRUE),
    median = ~ median(.x, na.rm = TRUE),
    sd = ~ sd(.x, na.rm = TRUE),
    min = ~ min(.x, na.rm = TRUE),
    max = ~ max(.x, na.rm = TRUE)
  )))
# NDVI amplitude between the seasonal peak and the start / end of season
db_resurv_RS_short_PLOT <- db_resurv_RS_short_PLOT %>%
mutate(
# Difference NDVI between Peak and SOS
diff_Peak_SOS = NDVI_at_Peak - NDVI_at_SOS,
# Difference NDVI between Peak and EOS
diff_Peak_EOS = NDVI_at_Peak - NDVI_at_EOS)
# Helper: faceted histograms (biogeographical region x variable) for the
# habitats of interest that have Sentinel-2 phenology data. Factors out
# the four copy-pasted ggplot calls that followed; output is identical.
plot_phen_histograms <- function(data, vars) {
  ggplot(data = data %>%
           # Keep only forests, grasslands, shrublands and wetlands
           filter(EUNISa_1 %in% c("T", "R", "S", "Q") &
                    S2_phen_data == TRUE) %>%
           pivot_longer(cols = all_of(vars), names_to = "name",
                        values_to = "value"),
         aes(x = value)) +
    geom_histogram(fill = "white", color = "black") +
    facet_grid(biogeo ~ name, scales = "free_y") +
    theme_bw()
}
# Phenological dates (day of year)
plot_phen_histograms(db_resurv_RS_short_PLOT,
                     c("SOS_DOY", "Peak_DOY", "EOS_DOY"))
# NDVI values at the phenological milestones
plot_phen_histograms(db_resurv_RS_short_PLOT,
                     c("NDVI_at_SOS", "NDVI_at_Peak", "NDVI_at_EOS"))
# NDVI amplitude between Peak and season start / end
plot_phen_histograms(db_resurv_RS_short_PLOT,
                     c("diff_Peak_SOS", "diff_Peak_EOS"))
# Growing-season length
plot_phen_histograms(db_resurv_RS_short_PLOT, "Season_Length")
distr_plot(db_resurv_RS_short_PLOT,
c("SOS_DOY","Peak_DOY", "EOS_DOY",
"NDVI_at_SOS", "NDVI_at_Peak", "NDVI_at_EOS",
"diff_Peak_SOS","diff_Peak_EOS", "Season_Length"),
c("SOS DOY", "Peak DOY", "EOS DOY",
"NDVI at SOS", "NDVI at Peak", "NDVI at EOS",
"Difference Peak-SOS", "Difference Peak-EOS", "Season Length"))
# Define a function to create plots with violin + boxplot + points
# Raincloud-style distribution plot of each y variable by EUNIS level-1
# habitat, faceted by biogeographical region. Unlike distr_plot(), the
# plots are returned as a named list instead of printed.
#
# data:     tibble with EUNISa_1 / EUNISa_1_descr, biogeo and y variables
# y_vars:   character vector of column names to plot
# y_labels: axis labels, parallel to y_vars
distr_plot_biogeo <- function(data, y_vars, y_labels) {
  plots <- list()
  for (i in seq_along(y_vars)) {
    y_var <- y_vars[[i]]
    y_label <- y_labels[[i]]
    p <- ggplot(data = data %>%
                  filter(EUNISa_1 %in% c("T", "R", "S", "Q")),
                aes(x = EUNISa_1_descr, y = !!sym(y_var),
                    fill = EUNISa_1_descr)) +
      geom_flat_violin(position = position_nudge(x = 0.2, y = 0),
                       alpha = 0.8) +
      geom_point(aes(y = !!sym(y_var), color = EUNISa_1_descr),
                 position = position_jitter(width = 0.15),
                 size = 1, alpha = 0.25) +
      geom_boxplot(width = 0.2, outlier.shape = NA, alpha = 0.5) +
      # fun (not the deprecated fun.y) marks the group mean
      stat_summary(fun = mean, geom = "point", shape = 20, size = 1) +
      # Sample size above each group; after_stat() replaces the
      # deprecated ..label.. pronoun
      stat_summary(fun.data = function(x) data.frame(y = max(x) + 0.1,
                                                     label = length(x)),
                   geom = "text", aes(label = after_stat(label)),
                   vjust = 0.5) +
      labs(y = y_label, x = "EUNISa_1_descr") +
      scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
      # "none" replaces the deprecated guides(fill = FALSE)
      guides(fill = "none", color = "none") +
      theme_bw() + coord_flip() + facet_wrap(~ biogeo)
    plots[[y_var]] <- p
  }
  return(plots)
}
Distribution plots:
distr_plot_biogeo(db_resurv_RS_short_PLOT %>% filter(!is.na(biogeo)),
c("NDVI_max", "NDVI_p90", "NDVI_min", "NDVI_p10"),
c("NDVI max", "NDVI p90", "NDVI min", "NDVI p10"))
$NDVI_max
$NDVI_p90
$NDVI_min
$NDVI_p10
distr_plot_biogeo(db_resurv_RS_short_PLOT %>% filter(!is.na(biogeo)),
c("NDMI_max", "NDMI_p90", "NDMI_min", "NDMI_p10"),
c("NDMI max", "NDMI p90", "NDMI min", "NDMI p10"))
$NDMI_max
$NDMI_p90
$NDMI_min
$NDMI_p10
distr_plot_biogeo(db_resurv_RS_short_PLOT %>% filter(!is.na(biogeo)),
c("NDWI_max", "NDWI_p90", "NDWI_min", "NDWI_p10"),
c("NDWI max", "NDWI p90", "NDWI min", "NDWI p10"))
$NDWI_max
$NDWI_p90
$NDWI_min
$NDWI_p10
distr_plot_biogeo(db_resurv_RS_short_PLOT %>% filter(!is.na(biogeo)),
c("SAVI_max", "SAVI_p90", "SAVI_min", "SAVI_p10"),
c("SAVI max", "SAVI p90", "SAVI min", "SAVI p10"))
$SAVI_max
$SAVI_p90
$SAVI_min
$SAVI_p10
distr_plot_biogeo(db_resurv_RS_short_PLOT %>% filter(!is.na(biogeo)) %>%
filter(EVI_max <= 1) %>%
filter(EVI_min >= -1 & EVI_min <= 1),
c("EVI_max", "EVI_p90", "EVI_min", "EVI_p10"),
c("EVI max", "EVI p90", "EVI min", "EVI p10"))
$EVI_max
$EVI_p90
$EVI_min
$EVI_p10
distr_plot_biogeo(db_resurv_RS_short_PLOT, "canopy_height", "Canopy height (m)")
$canopy_height
In this plot, those with biogeo = NA are those that do not have S2 or Landsat data (and thus biogeo has not been assigned), but have CH data. We should later assign a biogeo based on location.
distr_plot_biogeo(db_resurv_RS_short_PLOT %>% filter(!is.na(biogeo)),
c("SOS_DOY", "Peak_DOY", "EOS_DOY",
"NDVI_at_SOS", "NDVI_at_Peak", "NDVI_at_EOS",
"diff_Peak_SOS", "diff_Peak_EOS", "Season_Length"),
c("SOS DOY", "Peak DOY", "EOS DOY",
"NDVI at SOS", "NDVI at Peak", "NDVI at EOS",
"Difference Peak-SOS", "Difference Peak-EOS",
"Season Length"))
$SOS_DOY
$Peak_DOY
$EOS_DOY
$NDVI_at_SOS
$NDVI_at_Peak
$NDVI_at_EOS
$diff_Peak_SOS
$diff_Peak_EOS
$Season_Length
BOR missing because there is no phenology info for EUNISa_1 %in% c(“T”, “R”, “S”, “Q”).
ERRORS! Bea is checking this:
# Internal consistency checks of the phenology variables; any rows these
# return violate the expected ordering SOS <= Peak <= EOS (or the expected
# NDVI ordering) and indicate extraction errors
db_resurv_RS_short_PLOT %>% filter(SOS_DOY > Peak_DOY)
db_resurv_RS_short_PLOT %>% filter(Peak_DOY > EOS_DOY)
db_resurv_RS_short_PLOT %>% filter(SOS_DOY > EOS_DOY)
db_resurv_RS_short_PLOT %>% filter(NDVI_at_Peak < NDVI_at_SOS)
db_resurv_RS_short_PLOT %>% filter(NDVI_at_Peak < NDVI_at_EOS)
db_resurv_RS_short_PLOT %>% filter(NDVI_at_Peak < NDVI_max)
Summarize variables by plot:
# Summarize variables by plot (habitats of interest with satellite data).
# EUNIS1 is the plot's level-1 habitat, or "Change" if it changed between
# surveys.
db_resurv_RS_short_PLOT_summ <- db_resurv_RS_short_PLOT %>%
  filter(EUNISa_1 %in% c("T", "R", "S", "Q")) %>%
  # TRUE instead of the reassignable shorthand T
  filter(S2_data == TRUE | Landsat_data == TRUE) %>%
  group_by(PLOT) %>%
  summarize(EUNIS1 = if_else(n_distinct(EUNISa_1) > 1, "Change",
                             unique(EUNISa_1)[1]),
            count = n(),
            # NOTE(review): mean/sd lack na.rm = TRUE, so any NA in an
            # NDVI column makes the plot's summary NA — confirm intended
            across(starts_with("NDVI"), list(mean = mean, sd = sd),
                   .names = "{col}_{fn}"))
This summary may be used later; for now many plots have only one observation, probably because some Landsat data is still missing.
For T, R, S, Q habitats.
Define a set of rules for a first validation of ALL ReSurvey data. We can call these “Expert-based” rules.
Number of observations in ReSurvey from the habitats of interest:
nrow(db_resurv_RS_short_PLOT %>%
filter(EUNISa_1 %in% c("T", "R", "S", "Q")))
[1] 181062
Number of observations in ReSurvey from the habitats of interest and with all RS data:
nrow(db_resurv_RS_short_PLOT %>%
filter(EUNISa_1 %in% c("T", "R", "S", "Q")) %>%
filter(CH_data == T) %>%
filter(S2_data == T | Landsat_data ==T) %>%
filter(S2_phen_data == T))
[1] 5663
db_resurv_RS_short_PLOT_terrestrial <- db_resurv_RS_short_PLOT %>%
filter(EUNISa_1 %in% c("T", "R", "S", "Q"))
Create column for first validation based on different indicators, where “wrong” is noted when the validation rule is not met. Include EUNIS1 confusions.
db_resurv_RS_short_PLOT_terrestrial %>% count(EUNISa_1, EUNIS1_conf_type)
Define rules:
# Expert-based validation rules. Each valid_1_* column is "wrong" when its
# rule is broken and NA otherwise (NA inputs fall through to NA, i.e.
# missing RS data never breaks a rule); valid_1_count totals the broken
# rules per observation.
db_resurv_RS_short_PLOT_terrestrial <-
db_resurv_RS_short_PLOT_terrestrial %>%
mutate(
valid_1_NDWI = case_when(
# Points that are basically water
NDWI_max > 0.3 ~ "wrong",
TRUE ~ NA_character_),
valid_1_CH = case_when(
# T (forest) points with low canopy height
EUNISa_1 == "T" & canopy_height < 8 ~ "wrong",
# S (shrubland) points with low canopy height
EUNISa_1 =="S" & canopy_height < 5 ~ "wrong",
# R (grassland) & Q (wetland) points with high canopy height
EUNISa_1 %in% c("R", "Q") & canopy_height > 2 ~ "wrong",
TRUE ~ NA_character_),
valid_1_NDVI = case_when(
# T points with low NDVI_max
EUNISa_1 == "T" & NDVI_max < 0.6 ~ "wrong",
# S-R-Q points with low NDVI_max
EUNISa_1 %in% c("R", "S", "Q") & NDVI_max < 0.2 ~ "wrong",
TRUE ~ NA_character_),
# Count how many validation rules are not met (na.rm so that NA rule
# results count as zero broken rules)
valid_1_count = rowSums(across(c(valid_1_NDWI, valid_1_CH, valid_1_NDVI),
~ . == "wrong"), na.rm = TRUE),
# Points where at least 1 rule not met
valid_1 = if_else(valid_1_count > 0, "At least 1 rule broken",
"No rules broken so far")
)
# Bar plot of the number of broken rules, colored by which combination of
# rules (NDWI / NDVI / CH) was broken
ggplot(db_resurv_RS_short_PLOT_terrestrial%>%
mutate(rules_broken = case_when(
valid_1_count == 1 & valid_1_NDWI == "wrong" ~ "NDWI",
valid_1_count == 1 & valid_1_NDVI == "wrong" ~ "NDVI",
valid_1_count == 1 & valid_1_CH == "wrong" ~ "CH",
valid_1_count == 2 &
valid_1_NDWI == "wrong" & valid_1_NDVI == "wrong"~ "NDWI + NDVI",
valid_1_count == 2 &
valid_1_NDWI == "wrong" & valid_1_CH == "wrong"~ "NDWI + CH",
valid_1_count == 2 &
valid_1_NDVI == "wrong" & valid_1_CH == "wrong"~ "NDVI + CH",
valid_1_count == 3 ~ "NDWI + NDVI + CH",
TRUE ~ NA_character_
)),
aes(x = valid_1_count, fill = rules_broken)) +
geom_bar() + labs(x = "Number of broken rules")
# Same rules_broken classification as in the plot above, cross-tabulated
# against the EUNIS level-1 confusion type
db_resurv_RS_short_PLOT_terrestrial %>%
mutate(rules_broken = case_when(
valid_1_count == 1 & valid_1_NDWI == "wrong" ~ "NDWI",
valid_1_count == 1 & valid_1_NDVI == "wrong" ~ "NDVI",
valid_1_count == 1 & valid_1_CH == "wrong" ~ "CH",
valid_1_count == 2 &
valid_1_NDWI == "wrong" & valid_1_NDVI == "wrong"~ "NDWI + NDVI",
valid_1_count == 2 &
valid_1_NDWI == "wrong" & valid_1_CH == "wrong"~ "NDWI + CH",
valid_1_count == 2 &
valid_1_NDVI == "wrong" & valid_1_CH == "wrong"~ "NDVI + CH",
valid_1_count == 3 ~ "NDWI + NDVI + CH",
TRUE ~ NA_character_
)) %>%
count(rules_broken, EUNIS1_conf_type)
Proportion of observations not validated (so far):
nrow(db_resurv_RS_short_PLOT_terrestrial %>% filter(valid_1_count > 0))/
nrow(db_resurv_RS_short_PLOT_terrestrial)
[1] 0.1939004
But be aware that there are still MANY missing RS data.
ggplot(db_resurv_RS_short_PLOT_terrestrial %>%
mutate(diff_GPS = if_else(
`Location method` != "Location with differential GPS" |
is.na(`Location method`), "no", "yes")),
aes(x = diff_GPS, fill = valid_1)) +
geom_bar() + labs(x = "Differential GPS")
ggplot(db_resurv_RS_short_PLOT_terrestrial %>%
mutate(GPS = case_when(
`Location method` == "Location with differential GPS" ~ "yes",
`Location method` == "Location with GPS" ~ "yes",
is.na(`Location method`) ~ "no",
TRUE ~ "no"
)),
aes(x = GPS, fill = valid_1)) +
geom_bar() + labs(x = "GPS")
Points with any rule broken and confusion between EUNIS:
nrow(db_resurv_RS_short_PLOT_terrestrial %>%
filter(EUNIS1_conf == T & valid_1_count > 0))
[1] 10
Convert to shp to look at these in GIS:
# st_write(db_resurv_RS_short_PLOT_terrestrial %>%
# filter(EUNIS1_conf == T & valid_1_count > 0) %>%
# st_as_sf(coords = c("Lon_updated", "Lat_updated"), crs = 4326),
# "C:/GIS/MOTIVATE/shapefiles/resurv_not_val_EUNIS_conf.shp")
Checked these points in GIS and confirmed.
How many points with differential GPS that have at least 1 rule broken?
nrow(db_resurv_RS_short_PLOT_terrestrial %>%
filter(`Location method` == "Location with differential GPS" &
valid_1 == "At least 1 rule broken"))
[1] 1191
Convert to shp to look at these in GIS:
# st_write(db_resurv_RS_short_PLOT_terrestrial %>%
# filter(`Location method` == "Location with differential GPS" &
# valid_1 == "At least 1 rule broken") %>%
# st_as_sf(coords = c("Lon_updated", "Lat_updated"), crs = 4326),
# "C:/GIS/MOTIVATE/shapefiles/resurv_not_val_diff_GPS.shp")
# Load world boundaries
world <- ne_countries(scale = "medium", returnclass = "sf")
# Calculate the extent of the GPS-located points (habitats of interest
# with satellite data) to set the map zoom
points_GPS_extent <- db_resurv_RS_short_PLOT %>%
filter(EUNISa_1 %in% c("T", "R", "S", "Q")) %>%
filter(S2_data == T | Landsat_data == T ) %>%
filter(`Location method` == "Location with differential GPS" |
`Location method` == "Location with GPS") %>%
summarise(lon_min = min(Lon_updated, na.rm = TRUE),
lon_max = max(Lon_updated, na.rm = TRUE),
lat_min = min(Lat_updated, na.rm = TRUE),
lat_max = max(Lat_updated, na.rm = TRUE))
# Add padding to the extent (adjust as needed)
padding <- 2 # Degrees of margin around the point extent
x_limits <- c(points_GPS_extent$lon_min - padding,
points_GPS_extent$lon_max + padding)
y_limits <- c(points_GPS_extent$lat_min - padding,
points_GPS_extent$lat_max + padding)
# Create the zoomed map of GPS-located points, colored by habitat
ggplot() +
geom_sf(data = world, fill = "lightblue", color = "gray") +
geom_point(data = db_resurv_RS_short_PLOT %>%
filter(EUNISa_1 %in% c("T", "R", "S", "Q")) %>%
filter(S2_data == T | Landsat_data == T ) %>%
filter(`Location method` == "Location with differential GPS" |
`Location method` == "Location with GPS"),
aes(x = Lon_updated, y = Lat_updated, color = EUNISa_1),
size = 1) +
coord_sf(xlim = x_limits, ylim = y_limits) +
theme_minimal()
Number of GPS points by Country:
db_resurv_RS_short_PLOT %>%
filter(EUNISa_1 %in% c("T", "R", "S", "Q")) %>%
filter(S2_data == T | Landsat_data == T ) %>%
filter(`Location method` == "Location with differential GPS" |
`Location method` == "Location with GPS") %>%
count(Country)
# Calculate the extent of the points
points_resurvey_extent <- db_resurv_RS_short_PLOT %>%
filter(EUNISa_1 %in% c("T", "R", "S", "Q")) %>%
filter(S2_data == T | Landsat_data == T ) %>%
summarise(lon_min = min(Lon_updated, na.rm = TRUE),
lon_max = max(Lon_updated, na.rm = TRUE),
lat_min = min(Lat_updated, na.rm = TRUE),
lat_max = max(Lat_updated, na.rm = TRUE))
# Add padding to the extent (adjust as needed)
padding <- 2 # Adjust padding to your preference
x_limits <- c(points_resurvey_extent$lon_min - padding,
points_resurvey_extent$lon_max + padding)
y_limits <- c(points_resurvey_extent$lat_min - padding,
points_resurvey_extent$lat_max + padding)
# Create the zoomed map
ggplot() +
geom_sf(data = world, fill = "lightblue", color = "gray") +
geom_point(data = db_resurv_RS_short_PLOT %>%
filter(EUNISa_1 %in% c("T", "R", "S", "Q")) %>%
filter(S2_data == T | Landsat_data == T ),
aes(x = Lon_updated, y = Lat_updated, color = EUNISa_1),
size = 1) +
coord_sf(xlim = x_limits, ylim = y_limits) +
theme_minimal()
Number of ReSurvey points by Country:
db_resurv_RS_short_PLOT %>%
filter(EUNISa_1 %in% c("T", "R", "S", "Q")) %>%
filter(S2_data == T | Landsat_data == T ) %>%
count(Country)
Create tibble with differential GPS points without rules broken so far:
# GPS-located observations (differential or standard GPS) that have not
# broken any validation rule so far
all_GPS_valid <- db_resurv_RS_short_PLOT_terrestrial %>%
filter((`Location method` == "Location with differential GPS" |
`Location method` == "Location with GPS" ) &
valid_1 == "No rules broken so far")
distr_plot(all_GPS_valid,
c("NDVI_max", "NDVI_p90", "NDVI_min", "NDVI_p10"),
c("NDVI max", "NDVI p90", "NDVI min", "NDVI p10"))
distr_plot(all_GPS_valid,
c("NDMI_max", "NDMI_p90", "NDMI_min", "NDMI_p10"),
c("NDMI max", "NDMI p90", "NDMI min", "NDMI p10"))
distr_plot(all_GPS_valid,
c("NDWI_max", "NDWI_p90", "NDWI_min", "NDWI_p10"),
c("NDWI max", "NDWI p90", "NDWI min", "NDWI p10"))
distr_plot(all_GPS_valid,
c("SAVI_max", "SAVI_p90", "SAVI_min", "SAVI_p10"),
c("SAVI max", "SAVI p90", "SAVI min", "SAVI p10"))
distr_plot(all_GPS_valid %>%
filter(EVI_max <= 1) %>%
filter(EVI_min >= -1 & EVI_min <= 1),
c("EVI_max", "EVI_p90", "EVI_min", "EVI_p10"),
c("EVI max", "EVI p90", "EVI min", "EVI p10"))
distr_plot(all_GPS_valid, "canopy_height", "Canopy height (m)")
distr_plot(all_GPS_valid,
c("SOS_DOY","Peak_DOY", "EOS_DOY",
"NDVI_at_SOS", "NDVI_at_Peak", "NDVI_at_EOS",
"diff_Peak_SOS","diff_Peak_EOS", "Season_Length"),
c("SOS DOY", "Peak DOY", "EOS DOY",
"NDVI at SOS", "NDVI at Peak", "NDVI at EOS",
"Difference Peak-SOS", "Difference Peak-EOS", "Season Length"))
Chosen NDVI_min because it was important in RF models, but let’s see with new data!
# 20th-percentile thresholds of NDVI_max and NDMI_min per level-1
# habitat, computed from the GPS-located, so-far-valid observations
# (TRUE instead of the reassignable shorthand T).
percentiles_all_GPS <- all_GPS_valid %>%
  group_by(EUNISa_1) %>%
  summarize(
    percentile_20_NDVI_max = quantile(NDVI_max, probs = 0.20,
                                      na.rm = TRUE),
    percentile_20_NDMI_min = quantile(NDMI_min, probs = 0.20,
                                      na.rm = TRUE)
  )
# Tag each observation as above/below its habitat's 20th percentile
# (NA values of the index stay NA in the category columns)
all_GPS_valid <- all_GPS_valid %>%
  left_join(percentiles_all_GPS, by = "EUNISa_1") %>%
  mutate(category_NDVI_max = case_when(
    NDVI_max < percentile_20_NDVI_max ~ "below_20th",
    NDVI_max >= percentile_20_NDVI_max ~ "above_20th"),
    category_NDMI_min = case_when(
      NDMI_min < percentile_20_NDMI_min ~ "below_20th",
      NDMI_min >= percentile_20_NDMI_min ~ "above_20th"))
# Raincloud-style plot of NDVI_max per EUNIS level-1 habitat: flat violin +
# jittered points (coloured by p20 category) + boxplot, with the group mean
# as a dot and per-group n printed above each distribution.
# Fixes: fun.y was removed in ggplot2 3.3 (use fun =); ..label.. is
# superseded by after_stat(); guides(... = FALSE) is deprecated (use "none").
ggplot(data = all_GPS_valid,
       aes(x = EUNISa_1_descr, y = NDVI_max)) +
  geom_flat_violin(position = position_nudge(x = 0.2, y = 0), alpha = 0.8,
                   fill = "lightblue") +
  geom_point(aes(color = category_NDVI_max),
             position = position_jitter(width = 0.15), size = 1, alpha = 0.25) +
  geom_boxplot(width = 0.2, outlier.shape = NA, alpha = 0.5) +
  stat_summary(fun = mean, geom = "point", shape = 20, size = 1) +
  stat_summary(fun.data = function(x) data.frame(y = max(x) + 0.1, label = length(x)),
               geom = "text", aes(label = after_stat(label)), vjust = 0.5) +
  labs(y = "NDVI max", x = "EUNIS level 1") +
  guides(fill = "none", color = "none") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
  scale_color_manual(values = c("below_20th" = "grey", "above_20th" = "lightblue")) +
  theme_bw() + coord_flip()
# Same raincloud plot for NDMI_min per EUNIS level-1 habitat.
# Fixes: fun.y -> fun; ..label.. -> after_stat(label); guides FALSE -> "none".
ggplot(data = all_GPS_valid,
       aes(x = EUNISa_1_descr, y = NDMI_min)) +
  geom_flat_violin(position = position_nudge(x = 0.2, y = 0), alpha = 0.8,
                   fill = "lightblue") +
  geom_point(aes(color = category_NDMI_min),
             position = position_jitter(width = 0.15), size = 1, alpha = 0.25) +
  geom_boxplot(width = 0.2, outlier.shape = NA, alpha = 0.5) +
  stat_summary(fun = mean, geom = "point", shape = 20, size = 1) +
  stat_summary(fun.data = function(x) data.frame(y = max(x) + 0.1, label = length(x)),
               geom = "text", aes(label = after_stat(label)), vjust = 0.5) +
  labs(y = "NDMI min", x = "EUNIS level 1") +
  guides(fill = "none", color = "none") +
  scale_x_discrete(labels = function(x) str_wrap(x, width = 15)) +
  scale_color_manual(values = c("below_20th" = "grey", "above_20th" = "lightblue")) +
  theme_bw() + coord_flip()
Using the conditional inference version of random forest (cforest in package party). Suggested if the data are highly correlated. Cforest is more stable in deriving variable importance values in the presence of highly correlated variables, thus providing better accuracy in calculating variable importance (ref below).
Hothorn, T., Hornik, K. and Zeileis, A. (2006) Unbiased Recursive Partitioning: A Conditional Inference Framework. Journal of Computational and Graphical Statistics, 15, 651-674. http://dx.doi.org/10.1198/106186006X133933
# Complete cases for all spectral indices; response as factor; drop
# physically impossible EVI values.
filtered_data0 <- all_GPS_valid %>%
  drop_na(NDVI_max, NDMI_max, NDWI_max, SAVI_max, EVI_max,
          NDVI_min, NDMI_min, NDWI_min, SAVI_min, EVI_min) %>%
  mutate(EUNISa_1 = as.factor(EUNISa_1)) %>%
  filter(EVI_max <= 1 & EVI_min >= -1)
Split into training and test data sets.
# 70/30 train/test split (seq_len avoids the 1:n footgun; floor makes the
# sample size an explicit integer).
# NOTE(review): no set.seed() is visible upstream, so the split is not
# reproducible across runs — confirm whether a seed is set earlier.
train_indices0 <- sample(seq_len(nrow(filtered_data0)),
                         floor(0.7 * nrow(filtered_data0)))
train_data0 <- filtered_data0[train_indices0, ]
test_data0 <- filtered_data0[-train_indices0, ]
Number of points per category for filtered data:
filtered_data0 %>% count(EUNISa_1)
# Conditional-inference random forest (party::cforest) predicting EUNIS
# level-1 habitat from spectral-index extremes and canopy height.
rf_cforest0 <- party::cforest(EUNISa_1 ~ NDVI_max + NDVI_min + NDMI_max +
                                NDMI_min + NDWI_max + NDWI_min + EVI_max +
                                EVI_min + SAVI_max + SAVI_min + canopy_height,
                              data = train_data0,
                              controls = cforest_control(
                                mtry = 3,  # variables sampled per split
                                # alternatives: sqrt(11) ~ 3; package default 5;
                                # NULL / all 11 inputs = bagging
                                ntree = 500)  # default; try increasing
)
# Predicted classes on the held-out test set.
# NOTE(review): OOB = TRUE matters only when newdata is NULL (out-of-bag
# predictions on the training data); with explicit newdata it should have
# no effect — confirm against party's predict documentation.
predictions_rf_cforest0 <- predict(rf_cforest0, newdata = test_data0,
                                   OOB = TRUE, type = "response")
Confusion matrix:
confusionMatrix(predictions_rf_cforest0, test_data0$EUNISa_1)
Confusion Matrix and Statistics
Reference
Prediction Q R S T
Q 0 0 0 0
R 23 716 0 0
S 0 0 0 0
T 0 0 1 28
Overall Statistics
Accuracy : 0.9688
95% CI : (0.9539, 0.9799)
No Information Rate : 0.9323
P-Value [Acc > NIR] : 6.673e-06
Kappa : 0.6922
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: Q Class: R Class: S Class: T
Sensitivity 0.00000 1.0000 0.000000 1.00000
Specificity 1.00000 0.5577 1.000000 0.99865
Pos Pred Value NaN 0.9689 NaN 0.96552
Neg Pred Value 0.97005 1.0000 0.998698 1.00000
Prevalence 0.02995 0.9323 0.001302 0.03646
Detection Rate 0.00000 0.9323 0.000000 0.03646
Detection Prevalence 0.00000 0.9622 0.000000 0.03776
Balanced Accuracy 0.50000 0.7788 0.500000 0.99932
varimp_rf_cforest0 <- party::varimp(rf_cforest0, conditional = F)
Variable Importance Plot
# Variable importance as a sorted horizontal bar chart
varimp_rf_cforest0_df <- data.frame(Variable = names(varimp_rf_cforest0),
                                    Importance = varimp_rf_cforest0)
ggplot(varimp_rf_cforest0_df,
       aes(x = reorder(Variable, Importance), y = Importance)) +
  geom_col(fill = "lightblue") +  # idiomatic form of geom_bar(stat = "identity")
  coord_flip() + theme_minimal() +
  labs(title = "Variable Importance", x = "Variables", y = "Importance")
ROC curves:
# Predict per-class probabilities for each test observation
probabilities <- predict(rf_cforest0, newdata = test_data0, type = "prob")
# Step 1: Flatten the list of probability matrices into a data frame
prob_matrix <- t(sapply(probabilities, as.vector))
colnames(prob_matrix) <- c("Q", "R", "S", "T") # NOTE(review): assumes this level order — confirm
prob_df <- as.data.frame(prob_matrix)
# Step 2: Prepare actual class labels
actual <- factor(test_data0$EUNISa_1, levels = c("Q", "R", "S", "T"))
classes <- levels(actual)
# Step 3: Binarize (one-hot) the actual labels
actual_bin <- model.matrix(~ actual - 1)
colnames(actual_bin) <- gsub("actual", "", colnames(actual_bin))
# Step 4: One-vs-rest ROC per class, AUC in the legend label.
# Fix: FPR = 1 - specificity (the original plotted specificity itself on
# the axis labelled "False Positive Rate").
roc_data <- lapply(classes, function(class) {
  roc_obj <- roc(actual_bin[, class], prob_df[[class]])
  auc_val <- round(auc(roc_obj), 3)
  data.frame(
    FPR = rev(1 - roc_obj$specificities),
    TPR = rev(roc_obj$sensitivities),
    Class = paste0(class, " (AUC = ", auc_val, ")")
  )
}) %>% bind_rows()
G3;Setting levels: control = 0, case = 1
gG3;Setting direction: controls < cases
gG3;Setting levels: control = 0, case = 1
gG3;Setting direction: controls < cases
gG3;Setting levels: control = 0, case = 1
gG3;Setting direction: controls < cases
gG3;Setting levels: control = 0, case = 1
gG3;Setting direction: controls < cases
g
# Step 5: Plot ROC curves with ggplot2
# (linewidth replaces size, deprecated for lines since ggplot2 3.4)
roc0 <- ggplot(roc_data, aes(x = FPR, y = TPR, color = Class)) +
  geom_line(linewidth = 1.2) +
  geom_abline(linetype = "dashed", color = "gray") +
  labs(
    title = "Multiclass ROC Curves with AUC",
    x = "False Positive Rate",
    y = "True Positive Rate",
    color = "Class (AUC)"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")
roc0
Filter the data to get only GPS-points above p20 of NDVI_max and NDMI_min.
# Drop the earlier p20 columns so the re-join below does not duplicate them
all_GPS_valid <- all_GPS_valid %>%
  select(-percentile_20_NDVI_max, -percentile_20_NDMI_min)
# Per-habitat 20th and 80th percentiles of NDVI_max and NDMI_min
# (TRUE spelled out instead of T)
percentiles <- all_GPS_valid %>%
  group_by(EUNISa_1) %>%
  summarize(
    percentile_20_NDVI_max = quantile(NDVI_max, 0.20, na.rm = TRUE),
    percentile_20_NDMI_min = quantile(NDMI_min, 0.20, na.rm = TRUE),
    percentile_80_NDVI_max = quantile(NDVI_max, 0.80, na.rm = TRUE),
    percentile_80_NDMI_min = quantile(NDMI_min, 0.80, na.rm = TRUE)
  )
# Join the percentiles back to the original data
all_GPS_valid <- all_GPS_valid %>%
  left_join(percentiles, by = "EUNISa_1")
# Keep plots at or above their habitat's 20th percentile for both variables,
# then complete cases, factor response, and plausible EVI range.
filtered_data1 <- all_GPS_valid %>%
  filter(NDVI_max >= percentile_20_NDVI_max,
         NDMI_min >= percentile_20_NDMI_min) %>%
  drop_na(NDVI_max, NDMI_max, NDWI_max, SAVI_max, EVI_max,
          NDVI_min, NDMI_min, NDWI_min, SAVI_min, EVI_min) %>%
  mutate(EUNISa_1 = as.factor(EUNISa_1)) %>%
  filter(EVI_max <= 1 & EVI_min >= -1)
Split into training and test data sets.
# 70/30 train/test split (seq_len avoids the 1:n footgun; explicit floor).
# NOTE(review): no set.seed() visible upstream — split is not reproducible.
train_indices1 <- sample(seq_len(nrow(filtered_data1)),
                         floor(0.7 * nrow(filtered_data1)))
train_data1 <- filtered_data1[train_indices1, ]
test_data1 <- filtered_data1[-train_indices1, ]
Number of points per category for filtered data:
filtered_data1 %>% count(EUNISa_1)
Investigate package ggparty (e.g. autoplot function, and more).
TO-DO: Choose the hyperparameter mtry based on the square root of the number of predictor variables (Hastie et al., 2009).
Hastie, T., Tibshirani, R., & Friedman, J. (2009). The elements of statistical learning: Data mining, inference, and prediction. Springer Science & Business Media.
Maybe TO-DO: We varied ntree from 50 to 800 in steps of 50, leaving mtry constant at 2. This parameter variation showed that ntree=500 was optimal, while higher ntree led to no further model improvement (Supplementary Fig. S10). Subsequently, the hyperparameter mtry was varied from 2 to 8 with constant ntree=500. Here, mtry=3 led to the best results in almost all cases (Supplementary Fig. S11). Consequently, we chose ntree=500 and mtry=3 for our main analysis across all study sites.
# Conditional-inference random forest on the p20-filtered data (same
# predictors and hyperparameters as rf_cforest0).
rf_cforest1 <- party::cforest(EUNISa_1 ~ NDVI_max + NDVI_min + NDMI_max +
                                NDMI_min + NDWI_max + NDWI_min + EVI_max +
                                EVI_min + SAVI_max + SAVI_min + canopy_height,
                              data = train_data1,
                              controls = cforest_control(
                                mtry = 3,  # variables sampled per split
                                # alternatives: sqrt(11) ~ 3; package default 5;
                                # NULL / all 11 inputs = bagging
                                ntree = 500)  # default; try increasing
)
# Predicted classes on the held-out test set.
# NOTE(review): OOB = TRUE should be a no-op with explicit newdata — confirm.
predictions_rf_cforest1 <- predict(rf_cforest1, newdata = test_data1,
                                   OOB = TRUE, type = "response")
Confusion matrix:
confusionMatrix(predictions_rf_cforest1, test_data1$EUNISa_1)
Confusion Matrix and Statistics
Reference
Prediction Q R S T
Q 632 325 0 0
R 247 1491 0 0
S 0 0 45 3
T 0 0 7 77
Overall Statistics
Accuracy : 0.7941
95% CI : (0.7787, 0.8089)
No Information Rate : 0.6424
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.5872
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: Q Class: R Class: S Class: T
Sensitivity 0.7190 0.8210 0.86538 0.96250
Specificity 0.8332 0.7557 0.99892 0.99745
Pos Pred Value 0.6604 0.8579 0.93750 0.91667
Neg Pred Value 0.8679 0.7016 0.99748 0.99891
Prevalence 0.3109 0.6424 0.01839 0.02830
Detection Rate 0.2236 0.5274 0.01592 0.02724
Detection Prevalence 0.3385 0.6148 0.01698 0.02971
Balanced Accuracy 0.7761 0.7884 0.93215 0.97998
SurrogateTree –> does not work
varimp_rf_cforest1 <- party::varimp(rf_cforest1, conditional = F)
Variable Importance Plot
# Variable importance as a sorted horizontal bar chart
varimp_rf_cforest1_df <- data.frame(Variable = names(varimp_rf_cforest1),
                                    Importance = varimp_rf_cforest1)
ggplot(varimp_rf_cforest1_df,
       aes(x = reorder(Variable, Importance), y = Importance)) +
  geom_col(fill = "lightblue") +  # idiomatic form of geom_bar(stat = "identity")
  coord_flip() + theme_minimal() +
  labs(title = "Variable Importance", x = "Variables", y = "Importance")
Tree Visualization
# Create a single conditional inference tree (ctree) on the same predictors,
# as an interpretable companion to the forest
single_tree1 <- ctree(EUNISa_1 ~ NDVI_max + NDVI_min + NDMI_max + NDMI_min +
                        NDWI_max + NDWI_min + EVI_max + EVI_min + SAVI_max +
                        SAVI_min + canopy_height,
                      data = train_data1)
# Plot the tree. NOTE(review): autoplot() for party trees comes from ggparty,
# which is not attached in the visible library() calls — confirm it is loaded.
autoplot(single_tree1)
ROC curves:
# Predict per-class probabilities for each test observation
probabilities <- predict(rf_cforest1, newdata = test_data1, type = "prob")
# Step 1: Flatten the list of probability matrices into a data frame
prob_matrix <- t(sapply(probabilities, as.vector))
colnames(prob_matrix) <- c("Q", "R", "S", "T") # NOTE(review): assumes this level order — confirm
prob_df <- as.data.frame(prob_matrix)
# Step 2: Prepare actual class labels
actual <- factor(test_data1$EUNISa_1, levels = c("Q", "R", "S", "T"))
classes <- levels(actual)
# Step 3: Binarize (one-hot) the actual labels
actual_bin <- model.matrix(~ actual - 1)
colnames(actual_bin) <- gsub("actual", "", colnames(actual_bin))
# Step 4: One-vs-rest ROC per class, AUC in the legend label.
# Fix: FPR = 1 - specificity (the original plotted specificity on the FPR axis).
roc_data <- lapply(classes, function(class) {
  roc_obj <- roc(actual_bin[, class], prob_df[[class]])
  auc_val <- round(auc(roc_obj), 3)
  data.frame(
    FPR = rev(1 - roc_obj$specificities),
    TPR = rev(roc_obj$sensitivities),
    Class = paste0(class, " (AUC = ", auc_val, ")")
  )
}) %>% bind_rows()
G3;Setting levels: control = 0, case = 1
gG3;Setting direction: controls < cases
gG3;Setting levels: control = 0, case = 1
gG3;Setting direction: controls < cases
gG3;Setting levels: control = 0, case = 1
gG3;Setting direction: controls < cases
gG3;Setting levels: control = 0, case = 1
gG3;Setting direction: controls < cases
g
# Step 5: Plot ROC curves with ggplot2 (linewidth replaces deprecated size)
roc1 <- ggplot(roc_data, aes(x = FPR, y = TPR, color = Class)) +
  geom_line(linewidth = 1.2) +
  geom_abline(linetype = "dashed", color = "gray") +
  labs(
    title = "Multiclass ROC Curves with AUC",
    x = "False Positive Rate",
    y = "True Positive Rate",
    color = "Class (AUC)"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")
roc1
Filter the data to get only GPS-points within IQ range of NDVI_max and NDMI_min.
# Per-habitat quartiles and IQR of NDVI_max and NDMI_min
# (TRUE spelled out consistently; the original mixed T and TRUE)
IQ_ranges <- all_GPS_valid %>%
  group_by(EUNISa_1) %>%
  summarize(
    Q1_NDVI_max = quantile(NDVI_max, 0.25, na.rm = TRUE),
    Q1_NDMI_min = quantile(NDMI_min, 0.25, na.rm = TRUE),
    Q3_NDVI_max = quantile(NDVI_max, 0.75, na.rm = TRUE),
    Q3_NDMI_min = quantile(NDMI_min, 0.75, na.rm = TRUE),
    IQR_NDVI_max = IQR(NDVI_max, na.rm = TRUE),
    IQR_NDMI_min = IQR(NDMI_min, na.rm = TRUE)
  )
# Join the IQ ranges back to the original data
all_GPS_valid <- all_GPS_valid %>%
  left_join(IQ_ranges, by = "EUNISa_1")
# Keep plots whose NDVI_max and NDMI_min both fall inside their habitat's
# interquartile range, then complete cases, factor response, EVI range.
filtered_data2 <- all_GPS_valid %>%
  filter(NDVI_max >= Q1_NDVI_max & NDVI_max <= Q3_NDVI_max,
         NDMI_min >= Q1_NDMI_min & NDMI_min <= Q3_NDMI_min) %>%
  drop_na(NDVI_max, NDMI_max, NDWI_max, SAVI_max, EVI_max,
          NDVI_min, NDMI_min, NDWI_min, SAVI_min, EVI_min) %>%
  mutate(EUNISa_1 = as.factor(EUNISa_1)) %>%
  filter(EVI_max <= 1 & EVI_min >= -1)
Split into training and test data sets.
# 70/30 train/test split (seq_len avoids the 1:n footgun; explicit floor).
# NOTE(review): no set.seed() visible upstream — split is not reproducible.
train_indices2 <- sample(seq_len(nrow(filtered_data2)),
                         floor(0.7 * nrow(filtered_data2)))
train_data2 <- filtered_data2[train_indices2, ]
test_data2 <- filtered_data2[-train_indices2, ]
Number of points per category for filtered data:
filtered_data2 %>% count(EUNISa_1)
# Conditional-inference random forest on the IQR-filtered data (same
# predictors and hyperparameters as the previous models).
rf_cforest2 <- party::cforest(EUNISa_1 ~ NDVI_max + NDVI_min + NDMI_max +
                                NDMI_min + NDWI_max + NDWI_min + EVI_max +
                                EVI_min + SAVI_max + SAVI_min + canopy_height,
                              data = train_data2,
                              controls = cforest_control(
                                mtry = 3,  # variables sampled per split
                                # alternatives: sqrt(11) ~ 3; package default 5;
                                # NULL / all 11 inputs = bagging
                                ntree = 500)  # default; try increasing
)
# Predicted classes on the held-out test set.
# NOTE(review): OOB = TRUE should be a no-op with explicit newdata — confirm.
predictions_rf_cforest2 <- predict(rf_cforest2, newdata = test_data2,
                                   OOB = TRUE, type = "response")
Confusion matrix:
confusionMatrix(predictions_rf_cforest2, test_data2$EUNISa_1)
Confusion Matrix and Statistics
Reference
Prediction Q R S T
Q 325 6 0 0
R 45 870 0 0
S 0 0 14 2
T 0 0 2 23
Overall Statistics
Accuracy : 0.9573
95% CI : (0.9447, 0.9676)
No Information Rate : 0.6807
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.9032
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: Q Class: R Class: S Class: T
Sensitivity 0.8784 0.9932 0.87500 0.92000
Specificity 0.9935 0.8905 0.99843 0.99842
Pos Pred Value 0.9819 0.9508 0.87500 0.92000
Neg Pred Value 0.9529 0.9839 0.99843 0.99842
Prevalence 0.2875 0.6807 0.01243 0.01943
Detection Rate 0.2525 0.6760 0.01088 0.01787
Detection Prevalence 0.2572 0.7110 0.01243 0.01943
Balanced Accuracy 0.9359 0.9418 0.93671 0.95921
varimp_rf_cforest2 <- party::varimp(rf_cforest2, conditional = F)
Variable Importance Plot
# Variable importance as a sorted horizontal bar chart
varimp_rf_cforest2_df <- data.frame(Variable = names(varimp_rf_cforest2),
                                    Importance = varimp_rf_cforest2)
ggplot(varimp_rf_cforest2_df,
       aes(x = reorder(Variable, Importance), y = Importance)) +
  geom_col(fill = "lightblue") +  # idiomatic form of geom_bar(stat = "identity")
  coord_flip() + theme_minimal() +
  labs(title = "Variable Importance", x = "Variables", y = "Importance")
ROC curves:
# Predict per-class probabilities for each test observation.
# Fix: this section evaluated rf_cforest1 on test_data1 (copy-paste error);
# it must use model 2 and its own test set.
probabilities <- predict(rf_cforest2, newdata = test_data2, type = "prob")
# Step 1: Flatten the list of probability matrices into a data frame
prob_matrix <- t(sapply(probabilities, as.vector))
colnames(prob_matrix) <- c("Q", "R", "S", "T") # NOTE(review): assumes this level order — confirm
prob_df <- as.data.frame(prob_matrix)
# Step 2: Prepare actual class labels
actual <- factor(test_data2$EUNISa_1, levels = c("Q", "R", "S", "T"))
classes <- levels(actual)
# Step 3: Binarize (one-hot) the actual labels
actual_bin <- model.matrix(~ actual - 1)
colnames(actual_bin) <- gsub("actual", "", colnames(actual_bin))
# Step 4: One-vs-rest ROC per class, AUC in the legend label.
# Fix: FPR = 1 - specificity (the original plotted specificity on the FPR axis).
roc_data <- lapply(classes, function(class) {
  roc_obj <- roc(actual_bin[, class], prob_df[[class]])
  auc_val <- round(auc(roc_obj), 3)
  data.frame(
    FPR = rev(1 - roc_obj$specificities),
    TPR = rev(roc_obj$sensitivities),
    Class = paste0(class, " (AUC = ", auc_val, ")")
  )
}) %>% bind_rows()
G3;Setting levels: control = 0, case = 1
gG3;Setting direction: controls < cases
gG3;Setting levels: control = 0, case = 1
gG3;Setting direction: controls < cases
gG3;Setting levels: control = 0, case = 1
gG3;Setting direction: controls < cases
gG3;Setting levels: control = 0, case = 1
gG3;Setting direction: controls < cases
g
# Step 5: Plot ROC curves with ggplot2 (linewidth replaces deprecated size)
roc2 <- ggplot(roc_data, aes(x = FPR, y = TPR, color = Class)) +
  geom_line(linewidth = 1.2) +
  geom_abline(linetype = "dashed", color = "gray") +
  labs(
    title = "Multiclass ROC Curves with AUC",
    x = "False Positive Rate",
    y = "True Positive Rate",
    color = "Class (AUC)"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")
roc2
Filter the data to get only GPS-points within 1.5 * IQ range of NDVI_max and NDMI_min.
# Keep plots whose NDVI_max and NDMI_min both fall inside their habitat's
# Tukey fences (quartile ± 1.5 * IQR), then complete cases, factor response,
# plausible EVI range.
filtered_data3 <- all_GPS_valid %>%
  filter(NDVI_max >= (Q1_NDVI_max - 1.5 * IQR_NDVI_max),
         NDVI_max <= (Q3_NDVI_max + 1.5 * IQR_NDVI_max),
         NDMI_min >= (Q1_NDMI_min - 1.5 * IQR_NDMI_min),
         NDMI_min <= (Q3_NDMI_min + 1.5 * IQR_NDMI_min)) %>%
  drop_na(NDVI_max, NDMI_max, NDWI_max, SAVI_max, EVI_max,
          NDVI_min, NDMI_min, NDWI_min, SAVI_min, EVI_min) %>%
  mutate(EUNISa_1 = as.factor(EUNISa_1)) %>%
  filter(EVI_max <= 1 & EVI_min >= -1)
Split into training and test data sets.
# 70/30 train/test split (seq_len avoids the 1:n footgun; explicit floor).
# NOTE(review): no set.seed() visible upstream — split is not reproducible.
train_indices3 <- sample(seq_len(nrow(filtered_data3)),
                         floor(0.7 * nrow(filtered_data3)))
train_data3 <- filtered_data3[train_indices3, ]
test_data3 <- filtered_data3[-train_indices3, ]
Number of points per category for filtered data:
filtered_data3 %>% count(EUNISa_1)
# Conditional-inference random forest on the 1.5*IQR-filtered data (same
# predictors and hyperparameters as the previous models).
rf_cforest3 <- party::cforest(EUNISa_1 ~ NDVI_max + NDVI_min + NDMI_max +
                                NDMI_min + NDWI_max + NDWI_min + EVI_max +
                                EVI_min + SAVI_max + SAVI_min + canopy_height,
                              data = train_data3,
                              controls = cforest_control(
                                mtry = 3,  # variables sampled per split
                                # alternatives: sqrt(11) ~ 3; package default 5;
                                # NULL / all 11 inputs = bagging
                                ntree = 500)  # default; try increasing
)
# Predicted classes on the held-out test set.
# NOTE(review): OOB = TRUE should be a no-op with explicit newdata — confirm.
predictions_rf_cforest3 <- predict(rf_cforest3, newdata = test_data3,
                                   OOB = TRUE, type = "response")
Confusion matrix:
confusionMatrix(predictions_rf_cforest3, test_data3$EUNISa_1)
Confusion Matrix and Statistics
Reference
Prediction Q R S T
Q 660 355 0 0
R 552 2132 0 0
S 0 0 48 7
T 0 0 21 96
Overall Statistics
Accuracy : 0.7585
95% CI : (0.7447, 0.7719)
No Information Rate : 0.6425
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.4876
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: Q Class: R Class: S Class: T
Sensitivity 0.5446 0.8573 0.69565 0.93204
Specificity 0.8665 0.6012 0.99816 0.99443
Pos Pred Value 0.6502 0.7943 0.87273 0.82051
Neg Pred Value 0.8067 0.7009 0.99450 0.99814
Prevalence 0.3131 0.6425 0.01782 0.02661
Detection Rate 0.1705 0.5508 0.01240 0.02480
Detection Prevalence 0.2622 0.6934 0.01421 0.03022
Balanced Accuracy 0.7055 0.7292 0.84691 0.96323
varimp_rf_cforest3 <- party::varimp(rf_cforest3, conditional = F)
Variable Importance Plot
# Variable importance as a sorted horizontal bar chart
varimp_rf_cforest3_df <- data.frame(Variable = names(varimp_rf_cforest3),
                                    Importance = varimp_rf_cforest3)
ggplot(varimp_rf_cforest3_df,
       aes(x = reorder(Variable, Importance), y = Importance)) +
  geom_col(fill = "lightblue") +  # idiomatic form of geom_bar(stat = "identity")
  coord_flip() + theme_minimal() +
  labs(title = "Variable Importance", x = "Variables", y = "Importance")
ROC curves:
# Predict per-class probabilities for each test observation.
# Fix: this section evaluated rf_cforest1 on test_data1 (copy-paste error);
# it must use model 3 and its own test set.
probabilities <- predict(rf_cforest3, newdata = test_data3, type = "prob")
# Step 1: Flatten the list of probability matrices into a data frame
prob_matrix <- t(sapply(probabilities, as.vector))
colnames(prob_matrix) <- c("Q", "R", "S", "T") # NOTE(review): assumes this level order — confirm
prob_df <- as.data.frame(prob_matrix)
# Step 2: Prepare actual class labels
actual <- factor(test_data3$EUNISa_1, levels = c("Q", "R", "S", "T"))
classes <- levels(actual)
# Step 3: Binarize (one-hot) the actual labels
actual_bin <- model.matrix(~ actual - 1)
colnames(actual_bin) <- gsub("actual", "", colnames(actual_bin))
# Step 4: One-vs-rest ROC per class, AUC in the legend label.
# Fix: FPR = 1 - specificity (the original plotted specificity on the FPR axis).
roc_data <- lapply(classes, function(class) {
  roc_obj <- roc(actual_bin[, class], prob_df[[class]])
  auc_val <- round(auc(roc_obj), 3)
  data.frame(
    FPR = rev(1 - roc_obj$specificities),
    TPR = rev(roc_obj$sensitivities),
    Class = paste0(class, " (AUC = ", auc_val, ")")
  )
}) %>% bind_rows()
G3;Setting levels: control = 0, case = 1
gG3;Setting direction: controls < cases
gG3;Setting levels: control = 0, case = 1
gG3;Setting direction: controls < cases
gG3;Setting levels: control = 0, case = 1
gG3;Setting direction: controls < cases
gG3;Setting levels: control = 0, case = 1
gG3;Setting direction: controls < cases
g
# Step 5: Plot ROC curves with ggplot2 (linewidth replaces deprecated size)
roc3 <- ggplot(roc_data, aes(x = FPR, y = TPR, color = Class)) +
  geom_line(linewidth = 1.2) +
  geom_abline(linetype = "dashed", color = "gray") +
  labs(
    title = "Multiclass ROC Curves with AUC",
    x = "False Positive Rate",
    y = "True Positive Rate",
    color = "Class (AUC)"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")
roc3
Filter the data to get only GPS-points within mean +/- SD of NDVI_max and NDMI_min.
# Per-habitat mean ± SD of NDVI_max and NDMI_min.
# Fix: the original used all_GPS_valid$NDVI_max etc. inside summarize(),
# which bypasses group_by() and returns the GLOBAL mean/SD for every group;
# bare column names make the summaries per-habitat as intended.
mean_sd <- all_GPS_valid %>%
  group_by(EUNISa_1) %>%
  summarize(
    mean_NDVI_max = mean(NDVI_max, na.rm = TRUE),
    mean_NDMI_min = mean(NDMI_min, na.rm = TRUE),
    sd_NDVI_max = sd(NDVI_max, na.rm = TRUE),
    sd_NDMI_min = sd(NDMI_min, na.rm = TRUE)
  )
# Join the per-habitat mean/SD back to the original data
all_GPS_valid <- all_GPS_valid %>%
  left_join(mean_sd, by = "EUNISa_1")
# Keep plots whose NDVI_max and NDMI_min both fall within mean ± 1 SD for
# their habitat, then complete cases, factor response, plausible EVI range.
filtered_data4 <- all_GPS_valid %>%
  filter(NDVI_max >= (mean_NDVI_max - sd_NDVI_max),
         NDVI_max <= (mean_NDVI_max + sd_NDVI_max),
         NDMI_min >= (mean_NDMI_min - sd_NDMI_min),
         NDMI_min <= (mean_NDMI_min + sd_NDMI_min)) %>%
  drop_na(NDVI_max, NDMI_max, NDWI_max, SAVI_max, EVI_max,
          NDVI_min, NDMI_min, NDWI_min, SAVI_min, EVI_min) %>%
  mutate(EUNISa_1 = as.factor(EUNISa_1)) %>%
  filter(EVI_max <= 1 & EVI_min >= -1)
Split into training and test data sets.
# 70/30 train/test split (seq_len avoids the 1:n footgun; explicit floor).
# NOTE(review): no set.seed() visible upstream — split is not reproducible.
train_indices4 <- sample(seq_len(nrow(filtered_data4)),
                         floor(0.7 * nrow(filtered_data4)))
train_data4 <- filtered_data4[train_indices4, ]
test_data4 <- filtered_data4[-train_indices4, ]
Number of points per category for filtered data:
filtered_data4 %>% count(EUNISa_1)
# Conditional-inference random forest on the mean ± SD filtered data (same
# predictors and hyperparameters as the previous models).
rf_cforest4 <- party::cforest(EUNISa_1 ~ NDVI_max + NDVI_min + NDMI_max +
                                NDMI_min + NDWI_max + NDWI_min + EVI_max +
                                EVI_min + SAVI_max + SAVI_min + canopy_height,
                              data = train_data4,
                              controls = cforest_control(
                                mtry = 3,  # variables sampled per split
                                # alternatives: sqrt(11) ~ 3; package default 5;
                                # NULL / all 11 inputs = bagging
                                ntree = 500)  # default; try increasing
)
# Predicted classes on the held-out test set.
# NOTE(review): OOB = TRUE should be a no-op with explicit newdata — confirm.
predictions_rf_cforest4 <- predict(rf_cforest4, newdata = test_data4,
                                   OOB = TRUE, type = "response")
Confusion matrix:
confusionMatrix(predictions_rf_cforest4, test_data4$EUNISa_1)
Confusion Matrix and Statistics
Reference
Prediction Q R S T
Q 281 182 0 0
R 485 1233 0 0
S 0 0 27 9
T 0 0 17 60
Overall Statistics
Accuracy : 0.6979
95% CI : (0.6787, 0.7167)
No Information Rate : 0.6168
P-Value [Acc > NIR] : 2.873e-16
Kappa : 0.3564
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: Q Class: R Class: S Class: T
Sensitivity 0.3668 0.8714 0.61364 0.86957
Specificity 0.8809 0.4482 0.99600 0.99236
Pos Pred Value 0.6069 0.7177 0.75000 0.77922
Neg Pred Value 0.7351 0.6840 0.99247 0.99594
Prevalence 0.3339 0.6168 0.01918 0.03008
Detection Rate 0.1225 0.5375 0.01177 0.02616
Detection Prevalence 0.2018 0.7489 0.01569 0.03357
Balanced Accuracy 0.6239 0.6598 0.80482 0.93096
varimp_rf_cforest4 <- party::varimp(rf_cforest4, conditional = F)
Variable Importance Plot
# Variable importance as a sorted horizontal bar chart
varimp_rf_cforest4_df <- data.frame(Variable = names(varimp_rf_cforest4),
                                    Importance = varimp_rf_cforest4)
ggplot(varimp_rf_cforest4_df,
       aes(x = reorder(Variable, Importance), y = Importance)) +
  geom_col(fill = "lightblue") +  # idiomatic form of geom_bar(stat = "identity")
  coord_flip() + theme_minimal() +
  labs(title = "Variable Importance", x = "Variables", y = "Importance")
ROC curves:
# Predict per-class probabilities for each test observation.
# Fix: this section evaluated rf_cforest1 on test_data1 (copy-paste error);
# it must use model 4 and its own test set.
probabilities <- predict(rf_cforest4, newdata = test_data4, type = "prob")
# Step 1: Flatten the list of probability matrices into a data frame
prob_matrix <- t(sapply(probabilities, as.vector))
colnames(prob_matrix) <- c("Q", "R", "S", "T") # NOTE(review): assumes this level order — confirm
prob_df <- as.data.frame(prob_matrix)
# Step 2: Prepare actual class labels
actual <- factor(test_data4$EUNISa_1, levels = c("Q", "R", "S", "T"))
classes <- levels(actual)
# Step 3: Binarize (one-hot) the actual labels
actual_bin <- model.matrix(~ actual - 1)
colnames(actual_bin) <- gsub("actual", "", colnames(actual_bin))
# Step 4: One-vs-rest ROC per class, AUC in the legend label.
# Fix: FPR = 1 - specificity (the original plotted specificity on the FPR axis).
roc_data <- lapply(classes, function(class) {
  roc_obj <- roc(actual_bin[, class], prob_df[[class]])
  auc_val <- round(auc(roc_obj), 3)
  data.frame(
    FPR = rev(1 - roc_obj$specificities),
    TPR = rev(roc_obj$sensitivities),
    Class = paste0(class, " (AUC = ", auc_val, ")")
  )
}) %>% bind_rows()
G3;Setting levels: control = 0, case = 1
gG3;Setting direction: controls < cases
gG3;Setting levels: control = 0, case = 1
gG3;Setting direction: controls < cases
gG3;Setting levels: control = 0, case = 1
gG3;Setting direction: controls < cases
gG3;Setting levels: control = 0, case = 1
gG3;Setting direction: controls < cases
g
# Step 5: Plot ROC curves with ggplot2 (linewidth replaces deprecated size)
roc4 <- ggplot(roc_data, aes(x = FPR, y = TPR, color = Class)) +
  geom_line(linewidth = 1.2) +
  geom_abline(linetype = "dashed", color = "gray") +
  labs(
    title = "Multiclass ROC Curves with AUC",
    x = "False Positive Rate",
    y = "True Positive Rate",
    color = "Class (AUC)"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")
roc4
Filter the data to get only GPS-points above p20 and below p80 of NDVI_max and NDMI_min.
# Keep plots between their habitat's 20th and 80th percentiles for both
# variables, then complete cases, factor response, plausible EVI range.
filtered_data5 <- all_GPS_valid %>%
  filter(NDVI_max >= percentile_20_NDVI_max,
         NDVI_max <= percentile_80_NDVI_max,
         NDMI_min >= percentile_20_NDMI_min,
         NDMI_min <= percentile_80_NDMI_min) %>%
  drop_na(NDVI_max, NDMI_max, NDWI_max, SAVI_max, EVI_max,
          NDVI_min, NDMI_min, NDWI_min, SAVI_min, EVI_min) %>%
  mutate(EUNISa_1 = as.factor(EUNISa_1)) %>%
  filter(EVI_max <= 1 & EVI_min >= -1)
Split into training and test data sets.
# 70/30 train/test split (seq_len avoids the 1:n footgun; explicit floor).
# NOTE(review): no set.seed() visible upstream — split is not reproducible.
train_indices5 <- sample(seq_len(nrow(filtered_data5)),
                         floor(0.7 * nrow(filtered_data5)))
train_data5 <- filtered_data5[train_indices5, ]
test_data5 <- filtered_data5[-train_indices5, ]
Number of points per category for filtered data:
filtered_data5 %>% count(EUNISa_1)
# Conditional-inference random forest on the p20-p80 filtered data (same
# predictors and hyperparameters as the previous models).
rf_cforest5 <- party::cforest(EUNISa_1 ~ NDVI_max + NDVI_min + NDMI_max +
                                NDMI_min + NDWI_max + NDWI_min + EVI_max +
                                EVI_min + SAVI_max + SAVI_min + canopy_height,
                              data = train_data5,
                              controls = cforest_control(
                                mtry = 3,  # variables sampled per split
                                # alternatives: sqrt(11) ~ 3; package default 5;
                                # NULL / all 11 inputs = bagging
                                ntree = 500)  # default; try increasing
)
# Predicted classes on the held-out test set.
# NOTE(review): OOB = TRUE should be a no-op with explicit newdata — confirm.
predictions_rf_cforest5 <- predict(rf_cforest5, newdata = test_data5,
                                   OOB = TRUE, type = "response")
Confusion matrix:
confusionMatrix(predictions_rf_cforest5, test_data5$EUNISa_1)
Confusion Matrix and Statistics
Reference
Prediction Q R S T
Q 389 38 0 0
R 139 1120 0 0
S 0 0 17 3
T 0 0 8 32
Overall Statistics
Accuracy : 0.8923
95% CI : (0.8768, 0.9065)
No Information Rate : 0.6632
P-Value [Acc > NIR] : < 2.2e-16
Kappa : 0.7592
Mcnemar's Test P-Value : NA
Statistics by Class:
Class: Q Class: R Class: S Class: T
Sensitivity 0.7367 0.9672 0.680000 0.91429
Specificity 0.9688 0.7636 0.998257 0.99532
Pos Pred Value 0.9110 0.8896 0.850000 0.80000
Neg Pred Value 0.8946 0.9220 0.995365 0.99824
Prevalence 0.3024 0.6632 0.014318 0.02005
Detection Rate 0.2228 0.6415 0.009737 0.01833
Detection Prevalence 0.2446 0.7211 0.011455 0.02291
Balanced Accuracy 0.8528 0.8654 0.839128 0.95481
varimp_rf_cforest5 <- party::varimp(rf_cforest5, conditional = F)
Variable Importance Plot
# Variable importance as a sorted horizontal bar chart
varimp_rf_cforest5_df <- data.frame(Variable = names(varimp_rf_cforest5),
                                    Importance = varimp_rf_cforest5)
ggplot(varimp_rf_cforest5_df,
       aes(x = reorder(Variable, Importance), y = Importance)) +
  geom_col(fill = "lightblue") +  # idiomatic form of geom_bar(stat = "identity")
  coord_flip() + theme_minimal() +
  labs(title = "Variable Importance", x = "Variables", y = "Importance")
ROC curves:
# Predict per-class probabilities for each test observation.
# Fix: this section evaluated rf_cforest1 on test_data1 (copy-paste error);
# it must use model 5 and its own test set.
probabilities <- predict(rf_cforest5, newdata = test_data5, type = "prob")
# Step 1: Flatten the list of probability matrices into a data frame
prob_matrix <- t(sapply(probabilities, as.vector))
colnames(prob_matrix) <- c("Q", "R", "S", "T") # NOTE(review): assumes this level order — confirm
prob_df <- as.data.frame(prob_matrix)
# Step 2: Prepare actual class labels
actual <- factor(test_data5$EUNISa_1, levels = c("Q", "R", "S", "T"))
classes <- levels(actual)
# Step 3: Binarize (one-hot) the actual labels
actual_bin <- model.matrix(~ actual - 1)
colnames(actual_bin) <- gsub("actual", "", colnames(actual_bin))
# Step 4: One-vs-rest ROC per class, AUC in the legend label.
# Fix: FPR = 1 - specificity (the original plotted specificity on the FPR axis).
roc_data <- lapply(classes, function(class) {
  roc_obj <- roc(actual_bin[, class], prob_df[[class]])
  auc_val <- round(auc(roc_obj), 3)
  data.frame(
    FPR = rev(1 - roc_obj$specificities),
    TPR = rev(roc_obj$sensitivities),
    Class = paste0(class, " (AUC = ", auc_val, ")")
  )
}) %>% bind_rows()
G3;Setting levels: control = 0, case = 1
gG3;Setting direction: controls < cases
gG3;Setting levels: control = 0, case = 1
gG3;Setting direction: controls < cases
gG3;Setting levels: control = 0, case = 1
gG3;Setting direction: controls < cases
gG3;Setting levels: control = 0, case = 1
gG3;Setting direction: controls < cases
g
# Step 5: Plot ROC curves with ggplot2 (linewidth replaces deprecated size)
roc5 <- ggplot(roc_data, aes(x = FPR, y = TPR, color = Class)) +
  geom_line(linewidth = 1.2) +
  geom_abline(linetype = "dashed", color = "gray") +
  labs(
    title = "Multiclass ROC Curves with AUC",
    x = "False Positive Rate",
    y = "True Positive Rate",
    color = "Class (AUC)"
  ) +
  theme_minimal() +
  theme(legend.position = "bottom")
roc5
# Sentinel spectral-index metrics per plot (alpine grasslands, Cordillera)
AlpineGrasslands_indices <- read_csv(
  "C:/Data/MOTIVATE/Cordillera/AlpineGrasslands/AlpineGrassland_Sentinel_Plot_Allyear_Allmetrics.csv")
Rows: 40 Columns: 69
── Column specification ────────────────────────────────────────────────────────────
Delimiter: ","
chr (9): system:index, Codigo, H�bitat, Localidad, Precisi�n, Subregion, UTM, s...
dbl (60): Altitude__, Altura_med, Altura_m�, Cobertur_1, Cobertur_2, Cobertura, ...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Phenology metrics (SOS/EOS/Peak NDVI, amplitude) per alpine-grassland plot
AlpineGrasslands_phen <- read_csv(
  "C:/Data/MOTIVATE/Cordillera/AlpineGrasslands/AlpineGrasslands_Phenology_SOS_EOS_Peak_NDVI_Amplitude.csv")
Rows: 40 Columns: 34
── Column specification ────────────────────────────────────────────────────────────
Delimiter: ","
chr (8): system:index, Codigo, H�bitat, Localidad, Precisi�n, Subregion, UTM, .geo
dbl (26): Altitude__, Altura_med, Altura_m�, Cobertur_1, Cobertur_2, Cobertura, ...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Canopy height (1 m product) per alpine-grassland plot
AlpineGrasslands_CH <- read_csv(
  "C:/Data/MOTIVATE/Cordillera/AlpineGrasslands/AlpineGrasslands_CanopyHeight_1m.csv")
Rows: 40 Columns: 27
── Column specification ────────────────────────────────────────────────────────────
Delimiter: ","
chr (8): system:index, Codigo, Hábitat, Localidad, Precisión, Subregion, UTM, .geo
dbl (19): Altitude__, Altura_med, Altura_má, Cobertur_1, Cobertur_2, Cobertura, ...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Sentinel spectral-index metrics per vegetation-type plot
VegetationTypes_indices <- read_csv(
  "C:/Data/MOTIVATE/Cordillera/VegetationTypes/VegetationTypes_Sentinel_Plot_AllYear_Allmetrics.csv")
Rows: 32 Columns: 54
── Column specification ────────────────────────────────────────────────────────────
Delimiter: ","
chr (8): system:index, EUNIS, ID, TYPE, Xo, Yo, source, .geo
dbl (45): EVI_max, EVI_median, EVI_min, EVI_mode, EVI_p10, EVI_p90, EVI_stdDev,...
date (1): DATE
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Phenology metrics (SOS, EOS, peak DOY, NDVI amplitude) for the
# vegetation-type reference plots.
VegetationTypes_phen <- read_csv(
  file.path("C:/Data/MOTIVATE/Cordillera/VegetationTypes",
            "VegetationTypes_Phenology_SOS_EOS_Peak_NDVI_Amplitude.csv"))
Rows: 32 Columns: 19
── Column specification ────────────────────────────────────────────────────────────
Delimiter: ","
chr (7): system:index, EUNIS, ID, TYPE, Xo, Yo, .geo
dbl (11): EOS_DOY, NDVI_Amplitude, NDVI_at_EOS, NDVI_at_Peak, NDVI_at_SOS, Peak...
date (1): DATE
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Canopy height (1 m resolution) extracted at the vegetation-type plots.
VegetationTypes_CH <- read_csv(
  file.path("C:/Data/MOTIVATE/Cordillera/VegetationTypes",
            "VegetationTypes_CanopyHeight_1m.csv"))
Rows: 32 Columns: 12
── Column specification ────────────────────────────────────────────────────────────
Delimiter: ","
chr (7): system:index, EUNIS, ID, TYPE, Xo, Yo, .geo
dbl (4): X, Y, canopy_height, evaluated
date (1): DATE
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Combine the three AlpineGrasslands tables (spectral indices, phenology,
# canopy height) into one plot-level table.
# NOTE(review): both full_join() calls omit `by`, so dplyr joins on ALL shared
# column names (see the "Joining with ..." messages printed below) — confirm
# those implicit keys are the intended plot identifiers.
AlpineGrasslands <- AlpineGrasslands_indices %>%
# Drop GEE bookkeeping columns and the locality label before joining.
select(-`system:index`, -.geo, -Localidad) %>%
# The habitat column name arrives with a mangled accent from the CSV
# encoding; normalise it to "Hábitat" so it lines up across tables.
rename(Hábitat = "H�bitat") %>%
full_join(AlpineGrasslands_phen %>%
select(-`system:index`, -.geo, -Localidad) %>%
# Same encoding fix on the phenology table before the join.
rename(Hábitat = "H�bitat")) %>%
full_join(AlpineGrasslands_CH %>%
select(-`system:index`, -.geo, -Localidad)) %>%
# Drop the redundant year column and the precision column (whose name is
# also encoding-mangled in the source CSV).
select(-Date__year, - `Precisi�n`) %>%
# Parse DATE into a proper Date object.
mutate(DATE = ymd(DATE)) %>%
# Use the relevé number as the plot ID, as character — the VegetationTypes
# table reads ID as character (chr), so this keeps the types compatible.
rename(ID = "Releve_num") %>%
mutate(ID = as.character(ID)) %>%
# Tag the rows with their source layer for the later bind_rows().
mutate(layer = "AlpineGrasslands")
Joining with `by = join_by(Altitude__, Altura_med, `Altura_m�`, Cobertur_1,
Cobertur_2, Cobertura, Codigo, DATE, Gps_error, Hgtavg_hl, Hgtmax_hl, Hábitat,
Latitude, Longitude, Num_specie, Orientaci, Pendiente, `Precisi�n`, Releve_num,
Subregion, UTM, X, Y)`
Joining with `by = join_by(Altitude__, Altura_med, Cobertur_1, Cobertur_2,
Cobertura, Codigo, Gps_error, Hgtavg_hl, Hgtmax_hl, Hábitat, Latitude, Longitude,
Num_specie, Orientaci, Pendiente, Releve_num, Subregion, UTM, X, Y)`
# Assemble the VegetationTypes reference table: spectral indices joined with
# phenology and canopy height (implicit join keys, same as the original
# chained pipe), then relabel TYPE as Hábitat and tag the source layer.
drop_gee_cols <- function(df) select(df, -`system:index`, -.geo)

vt_indices <- drop_gee_cols(VegetationTypes_indices)
vt_phen <- drop_gee_cols(VegetationTypes_phen)
vt_canopy <- drop_gee_cols(VegetationTypes_CH)

VegetationTypes <- vt_indices %>%
  full_join(vt_phen) %>%
  full_join(vt_canopy) %>%
  rename(Hábitat = "TYPE") %>%
  mutate(layer = "VegetationTypes")
Joining with `by = join_by(DATE, EUNIS, ID, TYPE, X, Xo, Y, Yo, evaluated)`
Joining with `by = join_by(DATE, EUNIS, ID, TYPE, X, Xo, Y, Yo, evaluated)`
Merge both datasets:
# Stack both sources into one validation table, keeping only the columns used
# downstream (NDVI/NDMI summaries, phenology DOYs, canopy height), then derive
# the EUNIS level-1 code and its description from the free-text habitat label.
cordillera <- bind_rows(
  AlpineGrasslands %>% select(DATE, ID, starts_with("NDMI"),
                              starts_with("NDVI"), Hábitat, "EOS_DOY",
                              "Peak_DOY", "SOS_DOY", "Season_Length",
                              "canopy_height", "layer"),
  VegetationTypes %>% select(DATE, ID, starts_with("NDMI"),
                             starts_with("NDVI"), Hábitat, "EOS_DOY",
                             "Peak_DOY", "SOS_DOY", "Season_Length",
                             "canopy_height", "layer")
) %>%
  # BUG FIX: each case_when() clause was written "Hábitat = <condition> ~
  # <value>", which passed the formula as a *named argument* whose name
  # case_when() silently ignores. The clauses must be bare two-sided
  # formulas: condition ~ value.
  mutate(EUNISa_1 = case_when(
    str_detect(Hábitat, "Pastizal|Cervunal|grassland|meadow") ~ "R",
    str_detect(Hábitat, "forest") ~ "T",
    str_detect(Hábitat, "Scrub|scrub|Shrubland|shrubland|shrub|Heathland") ~ "S",
    str_detect(Hábitat, "Suelo|Scree|scree|cliff") ~ "U",
    # Unlabelled plots are assigned to grasslands. str_detect(NA, ...) yields
    # NA, which case_when() skips, so NA labels fall through to this clause.
    is.na(Hábitat) ~ "R",
    TRUE ~ NA_character_),
    EUNISa_1_descr = case_when(
      EUNISa_1 == "R" ~ "Grasslands",
      EUNISa_1 == "T" ~ "Forests and other wooded land",
      EUNISa_1 == "S" ~ "Heathlands, scrub and tundra",
      EUNISa_1 == "U" ~ "Inland habitats with no or little soil")
  )
# Distribution plots of the NDVI and NDMI summary metrics; display labels are
# the metric names with the underscore replaced by a space.
ndvi_vars <- c("NDVI_max", "NDVI_p90", "NDVI_min", "NDVI_p10")
distr_plot(cordillera, ndvi_vars, gsub("_", " ", ndvi_vars, fixed = TRUE))

ndmi_vars <- c("NDMI_max", "NDMI_p90", "NDMI_min", "NDMI_p10")
distr_plot(cordillera, ndmi_vars, gsub("_", " ", ndmi_vars, fixed = TRUE))
sessionInfo()
R version 4.5.0 (2025-04-11 ucrt)
Platform: x86_64-w64-mingw32/x64
Running under: Windows 11 x64 (build 26100)
Matrix products: default
LAPACK version 3.12.1
locale:
[1] LC_COLLATE=Spanish_Spain.utf8 LC_CTYPE=Spanish_Spain.utf8
[3] LC_MONETARY=Spanish_Spain.utf8 LC_NUMERIC=C
[5] LC_TIME=Spanish_Spain.utf8
time zone: Europe/Madrid
tzcode source: internal
attached base packages:
[1] stats4 grid stats graphics grDevices utils datasets methods
[9] base
other attached packages:
[1] pROC_1.18.5 randomForest_4.7-1.2 moreparty_0.4
[4] caret_7.0-1 lattice_0.22-6 ggparty_1.0.0
[7] partykit_1.2-24 libcoin_1.0-10 party_1.3-18
[10] strucchange_1.5-4 sandwich_3.1-1 zoo_1.8-14
[13] modeltools_0.2-24 mvtnorm_1.3-3 ggeffects_2.2.1
[16] car_3.1-3 carData_3.0-5 lmerTest_3.1-3
[19] lme4_1.1-37 Matrix_1.7-3 dtplyr_1.3.1
[22] rnaturalearth_1.0.1 sf_1.0-20 scales_1.4.0
[25] readxl_1.4.5 gridExtra_2.3 here_1.0.1
[28] lubridate_1.9.4 forcats_1.0.0 stringr_1.5.1
[31] dplyr_1.1.4 purrr_1.0.4 readr_2.1.5
[34] tidyr_1.3.1 tibble_3.2.1 ggplot2_3.5.2
[37] tidyverse_2.0.0
loaded via a namespace (and not attached):
[1] RColorBrewer_1.1-3 rstudioapi_0.17.1 jsonlite_2.0.0
[4] magrittr_2.0.3 TH.data_1.1-3 farver_2.1.2
[7] nloptr_2.2.1 rmarkdown_2.29 vctrs_0.6.5
[10] minqa_1.2.8 terra_1.8-50 htmltools_0.5.8.1
[13] varImp_0.4 cellranger_1.1.0 Formula_1.2-5
[16] sass_0.4.10 parallelly_1.44.0 bslib_0.9.0
[19] KernSmooth_2.23-26 htmlwidgets_1.6.4 plyr_1.8.9
[22] cachem_1.1.0 mime_0.13 lifecycle_1.0.4
[25] iterators_1.0.14 pkgconfig_2.0.3 R6_2.6.1
[28] fastmap_1.2.0 shiny_1.10.0 rbibutils_2.3
[31] future_1.49.0 digest_0.6.37 numDeriv_2016.8-1.1
[34] rprojroot_2.0.4 labeling_0.4.3 timechange_0.3.0
[37] httr_1.4.7 abind_1.4-8 compiler_4.5.0
[40] proxy_0.4-27 bit64_4.6.0-1 withr_3.0.2
[43] backports_1.5.0 DBI_1.2.3 MASS_7.3-65
[46] lava_1.8.1 classInt_0.4-11 ModelMetrics_1.2.2.2
[49] tools_4.5.0 units_0.8-7 httpuv_1.6.16
[52] future.apply_1.11.3 nnet_7.3-20 rnaturalearthdata_1.0.0
[55] glue_1.8.0 promises_1.3.2 nlme_3.1-168
[58] inum_1.0-5 checkmate_2.3.2 reshape2_1.4.4
[61] generics_0.1.4 recipes_1.3.0 gtable_0.3.6
[64] tzdb_0.5.0 class_7.3-23 data.table_1.17.2
[67] hms_1.1.3 utf8_1.2.5 coin_1.4-3
[70] foreach_1.5.2 pillar_1.10.2 vroom_1.6.5
[73] later_1.4.2 splines_4.5.0 bit_4.6.0
[76] survival_3.8-3 tidyselect_1.2.1 knitr_1.50
[79] reformulas_0.4.1 xfun_0.52 measures_0.3
[82] hardhat_1.4.1 timeDate_4041.110 matrixStats_1.5.0
[85] DT_0.33 phosphoricons_0.2.1 stringi_1.8.7
[88] yaml_2.3.10 boot_1.3-31 shinyWidgets_0.9.0
[91] evaluate_1.0.3 codetools_0.2-20 cli_3.6.5
[94] rpart_4.1.24 xtable_1.8-4 Rdpack_2.6.4
[97] jquerylib_0.1.4 Rcpp_1.0.14 globals_0.18.0
[100] parallel_4.5.0 rclipboard_0.2.1 gower_1.0.2
[103] listenv_0.9.1 viridisLite_0.4.2 ipred_0.9-15
[106] prodlim_2025.04.28 e1071_1.7-16 crayon_1.5.3
[109] insight_1.2.0 rlang_1.1.6 multcomp_1.4-28